diff --git a/dygie/tests/data/annotated_doc_test.py b/dygie/tests/data/annotated_doc_test.py index 9897c19..0809ffc 100644 --- a/dygie/tests/data/annotated_doc_test.py +++ b/dygie/tests/data/annotated_doc_test.py @@ -16,6 +16,8 @@ import annotated_doc as ad import spacy +verboseprint = print + class TestEnt(unittest.TestCase): def setUp(self): @@ -505,10 +507,692 @@ def test_format_dygiepp(self): res = annotated_doc.format_dygiepp() self.assertEqual(res, self.dygiepp_dict) + +class TestQualityCheck(unittest.TestCase): + """ + Tests quality_check_sent_splits and its helper, merge_mult_splits. + """ + def setUp(self): + + self.maxDiff = None + + # Helper input/output + self.no_merge_no_multi = [(1, 2), (5, 6)] + self.no_merge_no_multi_answer = [(1, 2), (5, 6)] + self.no_merge_yes_multi = [(1, 4), (5, 6)] + self.no_merge_yes_multi_answer = [(1, 4), (5, 6)] + self.yes_merge_no_multi = [(2, 3), (3, 4), (4, 5), (6, 7), (8, 9), + (9, 10)] + self.yes_merge_no_multi_answer = [(2, 5), (6, 7), (8, 10)] + self.yes_merge_yes_multi_no_overlap = [(2, 5), (6, 7), (8, 9), (9, 10)] + self.yes_merge_yes_multi_no_overlap_answer = [(2, 5), (6, 7), (8, 10)] + self.yes_merge_yes_multi_yes_overlap = [(2, 9), (8, 9), (9, 10)] + self.only_one_merge_pair = [(5,6), (6,7)] + self.only_one_merge_pair_answer = [(5,7)] + + # Main func input/output + # I'm using real instances for these, may not cover all possible instances + # but I've done my best to account for all of those in the helper + self.bioinfer_single = { + "doc_key": + "BioInfer.d70.s0", + "dataset": + "bioinfer_ppi", + "sentences": + [[ + "Aprotinin", "inhibited", "platelet", "aggregation", "induced", + "by", "thrombin", "(", "0.25", "U.ml-1", ")", "with", "IC50", + "200", "kIU.ml-1", ",", "and", "inhibited", "the", "rise", + "of", "cytosolic", "free", "calcium", "concentration", "in", + "platelets", "stimulated", "by", "thrombin", "(", "0.1", + "U.ml-1", ")", "in", "the", "absence", "and", "in", "the", + "presence", "of", 
"Ca2", "+", "0.5", "mmol", "." + ], + [ + "L-1", "(", "IC50", "117", "and", "50", "kIU.ml-1", ",", + "respectively", ")", ",", "but", "had", "no", "effect", "on", + "the", "amounts", "of", "actin", "and", "myosin", "heavy", + "chain", "associated", "with", "cytoskeletons", "." + ]], + "ner": [[[29, 29, "Individual_protein"], + [0, 0, "Individual_protein"], + [6, 6, "Individual_protein"]], + [[68, 70, "Individual_protein"], + [66, 66, "Individual_protein"]]], + "relations": [[[29, 29, 0, 0, "PPI"], [0, 0, 66, 66, "PPI"]], + [[68, 70, 0, 0, "PPI"]]] + } + self.bioinfer_single_answer = { + "doc_key": + "BioInfer.d70.s0", + "dataset": + "bioinfer_ppi", + "sentences": [[ + "Aprotinin", "inhibited", "platelet", "aggregation", "induced", + "by", "thrombin", "(", "0.25", "U.ml-1", ")", "with", "IC50", + "200", "kIU.ml-1", ",", "and", "inhibited", "the", "rise", + "of", "cytosolic", "free", "calcium", "concentration", "in", + "platelets", "stimulated", "by", "thrombin", "(", "0.1", + "U.ml-1", ")", "in", "the", "absence", "and", "in", "the", + "presence", "of", "Ca2", "+", "0.5", "mmol", ".", "L-1", "(", + "IC50", "117", "and", "50", "kIU.ml-1", ",", "respectively", + ")", ",", "but", "had", "no", "effect", "on", "the", "amounts", + "of", "actin", "and", "myosin", "heavy", "chain", "associated", + "with", "cytoskeletons", "." + ]], + "ner": [[[29, 29, "Individual_protein"], + [0, 0, "Individual_protein"], + [6, 6, "Individual_protein"], + [68, 70, "Individual_protein"], + [66, 66, "Individual_protein"]]], + "relations": [[[29, 29, 0, 0, "PPI"], [0, 0, 66, 66, "PPI"], + [68, 70, 0, 0, "PPI"]]] + } + + self.pickle_mult_no_overlaps_or_merges = { + "doc_key": + "PMID12825696_abstract", + "dataset": + "pickle", + "sentences": + [[ + "Ca2", "+", "and", "calmodulin", "(", "CaM", ")", ",", "a", + "key", "Ca2", "+", "sensor", "in", "all", "eukaryotes", ",", + "have", "been", "implicated", "in", "defense", "responses", + "in", "plants", "." 
+ ], + [ + "To", "elucidate", "the", "role", "of", "Ca2", "+", "and", + "CaM", "in", "defense", "signaling", ",", "we", "used", + "35S-labeled", "CaM", "to", "screen", "expression", + "libraries", "prepared", "from", "tissues", "that", "were", + "either", "treated", "with", "an", "elicitor", "derived", + "from", "Phytophthora", "megasperma", "or", "infected", + "with", "Pseudomonas", "syringae", "pv", "." + ], ["tabaci", "."], + [ + "Nineteen", "cDNAs", "that", "encode", "the", "same", + "protein", ",", "pathogen-induced", "CaM-binding", "protein", + "(", "PICBP", ")", ",", "were", "isolated", "." + ], + [ + "The", "PICBP", "fusion", "proteins", "bound", "35S-CaM", ",", + "horseradish", "peroxidase-labeled", "CaM", "and", + "CaM-Sepharose", "in", "the", "presence", "of", "Ca2", "+", + "whereas", "EGTA", ",", "a", "Ca2", "+", "chelator", ",", + "abolished", "binding", ",", "confirming", "that", "PICBP", + "binds", "CaM", "in", "a", "Ca2", "+", "-dependent", "manner", + "." + ], + [ + "Using", "a", "series", "of", "bacterially", "expressed", + "truncated", "versions", "of", "PICBP", ",", "four", + "CaM-binding", "domains", ",", "with", "a", "potential", + "CaM-binding", "consensus", "sequence", "of", + "WSNLKKVILLKRFVKSL", ",", "were", "identified", "." + ], + [ + "The", "deduced", "PICBP", "protein", "sequence", "is", + "rich", "in", "leucine", "residues", "and", "contains", + "three", "classes", "of", "repeats", "." + ], + [ + "The", "PICBP", "gene", "is", "differentially", "expressed", + "in", "tissues", "with", "the", "highest", "expression", "in", + "stem", "." + ], + [ + "The", "expression", "of", "PICBP", "in", "Arabidopsis", + "was", "induced", "in", "response", "to", "avirulent", + "Pseudomonas", "syringae", "pv", "." + ], ["tomato", "carrying", "avrRpm1", "."], + [ + "Furthermore", ",", "PICBP", "is", "constitutively", + "expressed", "in", "the", "Arabidopsis", "accelerated", + "cell", "death2", "-", "2", "mutant", "." 
+ ], + [ + "The", "expression", "of", "PICBP", "in", "bean", "leaves", + "was", "also", "induced", "after", "inoculation", "with", + "avirulent", "and", "non-pathogenic", "bacterial", "strains", + "." + ], + [ + "In", "addition", ",", "the", "hrp1", "mutant", "of", + "Pseudomonas", "syringae", "pv", "." + ], + [ + "tabaci", "and", "inducers", "of", "plant", "defense", "such", + "as", "salicylic", "acid", ",", "hydrogen", "peroxide", "and", + "a", "fungal", "elicitor", "induced", "PICBP", "expression", + "in", "bean", "." + ], + [ + "Our", "data", "suggest", "a", "role", "for", "PICBP", "in", + "Ca2", "+", "-mediated", "defense", "signaling", "and", + "cell-death", "." + ], + [ + "Furthermore", ",", "PICBP", "is", "the", "first", + "identified", "CBP", "in", "eukaryotes", "with", "four", + "Ca2", "+", "-dependent", "CaM-binding", "domains", "." + ]], + "ner": [[[3, 3, "Protein"], [5, 5, "Protein"], [0, 1, "Element"], + [10, 12, "Protein"]], + [[34, 34, "Protein"], [59, 60, "Multicellular_organism"], + [31, 32, "Element"], [41, 42, "Protein"], + [64, 68, "Unicellular_organism"]], [], + [[82, 82, "Protein"], [78, 80, "Protein"]], + [[107, 107, "Organic_compound_other"], [93, 93, "Protein"], + [95, 97, "Protein"], [99, 99, "Protein"], + [104, 105, "Element"], [119, 119, "Protein"], + [121, 121, "Protein"], [89, 91, "Protein"], + [110, 112, "Organic_compound_other"]], + [[138, 138, "Protein"], [141, 142, "Peptide"], + [151, 151, "Peptide"], [147, 149, "Peptide"]], + [[164, 165, "Amino_acid_monomer"], [158, 159, "Protein"]], + [[174, 175, "DNA"]], + [[193, 193, "Multicellular_organism"], [191, 191, "DNA"], + [199, 204, "Unicellular_organism"]], [[206, 206, "DNA"]], + [[210, 210, "Protein"], + [216, 222, "Multicellular_organism"]], + [[229, 230, "Plant_region"], [227, 227, "DNA"]], + [[247, 248, "Unicellular_organism"], + [250, 254, "Unicellular_organism"]], + [[265, 266, "Inorganic_compound_other"], + [262, 263, "Plant_hormone"], + [272, 273, "Biochemical_process"], + 
[275, 275, "Multicellular_organism"]], + [[285, 291, "Biochemical_process"], [283, 283, "DNA"]], + [[295, 295, "Protein"], [305, 309, "Peptide"], + [300, 300, "Protein"]]], + "relations": [[], [], [], [], + [[119, 119, 121, 121, "interacts"], + [89, 91, 93, 93, "interacts"], + [89, 91, 95, 97, "interacts"], + [89, 91, 99, 99, "interacts"], + [89, 91, 104, 105, "interacts"], + [107, 107, 89, 91, "inhibits"], + [107, 107, 104, 105, "inhibits"]], [], + [[164, 165, 158, 159, "is-in"]], [], + [[191, 191, 193, 193, "is-in"], + [199, 204, 191, 191, "activates"]], + [[206, 206, 199, 204, "is-in"]], + [[210, 210, 216, 222, "is-in"]], + [[227, 227, 229, 230, "is-in"]], + [[247, 248, 272, 273, "activates"]], + [[262, 263, 272, 273, "activates"], + [265, 266, 272, 273, "activates"], + [272, 273, 275, 275, "is-in"]], + [[283, 283, 285, 291, "is-in"]], + [[305, 309, 295, 295, "is-in"]]] + } + + self.pickle_mult_no_overlaps_or_merges_answer = { + "doc_key": + "PMID12825696_abstract", + "dataset": + "pickle", + "sentences": + [[ + "Ca2", "+", "and", "calmodulin", "(", "CaM", ")", ",", "a", + "key", "Ca2", "+", "sensor", "in", "all", "eukaryotes", ",", + "have", "been", "implicated", "in", "defense", "responses", + "in", "plants", "." + ], + [ + "To", "elucidate", "the", "role", "of", "Ca2", "+", "and", + "CaM", "in", "defense", "signaling", ",", "we", "used", + "35S-labeled", "CaM", "to", "screen", "expression", + "libraries", "prepared", "from", "tissues", "that", "were", + "either", "treated", "with", "an", "elicitor", "derived", + "from", "Phytophthora", "megasperma", "or", "infected", + "with", "Pseudomonas", "syringae", "pv", ".", "tabaci", "." + ], + [ + "Nineteen", "cDNAs", "that", "encode", "the", "same", + "protein", ",", "pathogen-induced", "CaM-binding", "protein", + "(", "PICBP", ")", ",", "were", "isolated", "." 
+ ], + [ + "The", "PICBP", "fusion", "proteins", "bound", "35S-CaM", ",", + "horseradish", "peroxidase-labeled", "CaM", "and", + "CaM-Sepharose", "in", "the", "presence", "of", "Ca2", "+", + "whereas", "EGTA", ",", "a", "Ca2", "+", "chelator", ",", + "abolished", "binding", ",", "confirming", "that", "PICBP", + "binds", "CaM", "in", "a", "Ca2", "+", "-dependent", "manner", + "." + ], + [ + "Using", "a", "series", "of", "bacterially", "expressed", + "truncated", "versions", "of", "PICBP", ",", "four", + "CaM-binding", "domains", ",", "with", "a", "potential", + "CaM-binding", "consensus", "sequence", "of", + "WSNLKKVILLKRFVKSL", ",", "were", "identified", "." + ], + [ + "The", "deduced", "PICBP", "protein", "sequence", "is", + "rich", "in", "leucine", "residues", "and", "contains", + "three", "classes", "of", "repeats", "." + ], + [ + "The", "PICBP", "gene", "is", "differentially", "expressed", + "in", "tissues", "with", "the", "highest", "expression", "in", + "stem", "." + ], + [ + "The", "expression", "of", "PICBP", "in", "Arabidopsis", + "was", "induced", "in", "response", "to", "avirulent", + "Pseudomonas", "syringae", "pv", ".", "tomato", "carrying", + "avrRpm1", "." + ], + [ + "Furthermore", ",", "PICBP", "is", "constitutively", + "expressed", "in", "the", "Arabidopsis", "accelerated", + "cell", "death2", "-", "2", "mutant", "." + ], + [ + "The", "expression", "of", "PICBP", "in", "bean", "leaves", + "was", "also", "induced", "after", "inoculation", "with", + "avirulent", "and", "non-pathogenic", "bacterial", "strains", + "." + ], + [ + "In", "addition", ",", "the", "hrp1", "mutant", "of", + "Pseudomonas", "syringae", "pv", ".", "tabaci", "and", + "inducers", "of", "plant", "defense", "such", "as", + "salicylic", "acid", ",", "hydrogen", "peroxide", "and", "a", + "fungal", "elicitor", "induced", "PICBP", "expression", "in", + "bean", "." 
+ ], + [ + "Our", "data", "suggest", "a", "role", "for", "PICBP", "in", + "Ca2", "+", "-mediated", "defense", "signaling", "and", + "cell-death", "." + ], + [ + "Furthermore", ",", "PICBP", "is", "the", "first", + "identified", "CBP", "in", "eukaryotes", "with", "four", + "Ca2", "+", "-dependent", "CaM-binding", "domains", "." + ]], + "ner": [[[3, 3, "Protein"], [5, 5, "Protein"], [0, 1, "Element"], + [10, 12, "Protein"]], + [[34, 34, "Protein"], [59, 60, "Multicellular_organism"], + [31, 32, "Element"], [41, 42, "Protein"], + [64, 68, "Unicellular_organism"]], + [[82, 82, "Protein"], [78, 80, "Protein"]], + [[107, 107, "Organic_compound_other"], [93, 93, "Protein"], + [95, 97, "Protein"], [99, 99, "Protein"], + [104, 105, "Element"], [119, 119, "Protein"], + [121, 121, "Protein"], [89, 91, "Protein"], + [110, 112, "Organic_compound_other"]], + [[138, 138, "Protein"], [141, 142, "Peptide"], + [151, 151, "Peptide"], [147, 149, "Peptide"]], + [[164, 165, "Amino_acid_monomer"], [158, 159, "Protein"]], + [[174, 175, "DNA"]], + [[193, 193, "Multicellular_organism"], [191, 191, "DNA"], + [199, 204, "Unicellular_organism"], [206, 206, "DNA"]], + [[210, 210, "Protein"], + [216, 222, "Multicellular_organism"]], + [[229, 230, "Plant_region"], [227, 227, "DNA"]], + [[247, 248, "Unicellular_organism"], + [250, 254, "Unicellular_organism"], + [265, 266, "Inorganic_compound_other"], + [262, 263, "Plant_hormone"], + [272, 273, "Biochemical_process"], + [275, 275, "Multicellular_organism"]], + [[285, 291, "Biochemical_process"], [283, 283, "DNA"]], + [[295, 295, "Protein"], [305, 309, "Peptide"], + [300, 300, "Protein"]]], + "relations": [[], [], [], + [[119, 119, 121, 121, "interacts"], + [89, 91, 93, 93, "interacts"], + [89, 91, 95, 97, "interacts"], + [89, 91, 99, 99, "interacts"], + [89, 91, 104, 105, "interacts"], + [107, 107, 89, 91, "inhibits"], + [107, 107, 104, 105, "inhibits"]], [], + [[164, 165, 158, 159, "is-in"]], [], + [[191, 191, 193, 193, "is-in"], + [199, 204, 
191, 191, "activates"], + [206, 206, 199, 204, "is-in"]], + [[210, 210, 216, 222, "is-in"]], + [[227, 227, 229, 230, "is-in"]], + [[247, 248, 272, 273, "activates"], + [262, 263, 272, 273, "activates"], + [265, 266, 272, 273, "activates"], + [272, 273, 275, 275, "is-in"]], + [[283, 283, 285, 291, "is-in"]], + [[305, 309, 295, 295, "is-in"]]] + } + + self.pickle_mult_subsequent_merges = { + "doc_key": + "PMID28911019_abstract", + "dataset": + "pickle", + "sentences": + [[ + "BACKGROUND", "AND", "AIMS", ":", "Selected", "beneficial", + "Pseudomonas", "spp", ".", "strains", "have", "the", "ability", + "to", "influence", "root", "architecture", "in", "Arabidopsis", + "thaliana", "by", "inhibiting", "primary", "root", "elongation", + "and", "promoting", "lateral", "root", "and", "root", "hair", + "formation", "." + ], + [ + "A", "crucial", "role", "for", "auxin", "in", "this", "long-term", + "(", "1week", ")", ",", "long-distance", "plant-microbe", + "interaction", "has", "been", "demonstrated", "." + ], + [ + "METHODS", ":", "Arabidopsis", "seedlings", "were", "cultivated", + "in", "vitro", "on", "vertical", "plates", "and", "inoculated", + "with", "pathogenic", "strains", "Pseudomonas", "syringae", "pv", + "." + ], + ["maculicola", "(", "Psm", ")", "and", "P.", "syringae", "pv", "."], + [ + "tomato", "DC3000", "(", "Pst", ")", ",", "as", "well", "as", + "Agrobacterium", "tumefaciens", "(", "Atu", ")", "and", + "Escherichia", "coli", "(", "Eco", ")", "." + ], + [ + "Root", "hair", "lengths", "were", "measured", "after", "24", + "and", "48h", "of", "direct", "exposure", "to", "each", + "bacterial", "strain", "." + ], + [ + "Several", "Arabidopsis", "mutants", "with", "impaired", + "responses", "to", "pathogens", ",", "impaired", "ethylene", + "perception", "and", "defects", "in", "the", "exocyst", "vesicle", + "tethering", "complex", "that", "is", "involved", "in", + "secretion", "were", "also", "analysed", "." 
+ ], + [ + "KEY", "RESULTS", ":", "Arabidopsis", "seedling", "roots", + "infected", "with", "Psm", "or", "Pst", "responded", "similarly", + "to", "when", "infected", "with", "plant", "growth-promoting", + "rhizobacteria", ";", "root", "hair", "growth", "was", + "stimulated", "and", "primary", "root", "growth", "was", + "inhibited", "." + ], + [ + "Other", "plant-", "and", "soil-adapted", "bacteria", "induced", + "similar", "root", "hair", "responses", "." + ], + [ + "The", "most", "compromised", "root", "hair", "growth", + "stimulation", "response", "was", "found", "for", "the", + "knockout", "mutants", "exo70A1", "and", "ein2", "." + ], + [ + "The", "single", "immune", "pathways", "dependent", "on", + "salicylic", "acid", ",", "jasmonic", "acid", "and", "PAD4", + "are", "not", "directly", "involved", "in", "root", "hair", + "growth", "stimulation", ";", "however", ",", "in", "the", + "mutual", "cross-talk", "with", "ethylene", ",", "they", + "indirectly", "modify", "the", "extent", "of", "the", + "stimulation", "of", "root", "hair", "growth", "." + ], + [ + "The", "Flg22", "peptide", "does", "not", "initiate", "root", + "hair", "stimulation", "as", "intact", "bacteria", "do", ",", + "but", "pretreatment", "with", "Flg22", "prior", "to", "Psm", + "inoculation", "abolished", "root", "hair", "growth", + "stimulation", "in", "an", "FLS2", "receptor", "kinase-dependent", + "manner", "." + ], + [ + "These", "early", "response", "phenomena", "are", "not", + "associated", "with", "changes", "in", "auxin", "levels", ",", + "as", "monitored", "with", "the", "pDR5::GUS", "auxin", + "reporter", "." + ], + [ + "CONCLUSIONS", ":", "Early", "stimulation", "of", "root", "hair", + "growth", "is", "an", "effect", "of", "an", "unidentified", + "component", "of", "living", "plant", "pathogenic", "bacteria", + "." 
+ ], + [ + "The", "root", "hair", "growth", "response", "is", "triggered", + "in", "the", "range", "of", "hours", "after", "bacterial", + "contact", "with", "roots", "and", "can", "be", "modulated", "by", + "FLS2", "signalling", "." + ], + [ + "Bacterial", "stimulation", "of", "root", "hair", "growth", + "requires", "functional", "ethylene", "signalling", "and", "an", + "efficient", "exocyst-dependent", "secretory", "machinery", "." + ]], + "ner": [[[6, 9, "Unicellular_organism"], + [18, 19, "Multicellular_organism"]], + [[38, 38, "Plant_hormone"]], + [[55, 56, "Multicellular_organism"], + [69, 73, "Unicellular_organism"]], + [[75, 75, "Unicellular_organism"], + [78, 83, "Unicellular_organism"]], + [[85, 85, "Unicellular_organism"], + [91, 92, "Unicellular_organism"], + [94, 94, "Unicellular_organism"], + [97, 98, "Unicellular_organism"], + [100, 100, "Unicellular_organism"]], [], + [[121, 122, "Multicellular_organism"], + [130, 131, "Biochemical_process"]], + [[152, 154, "Plant_region"], + [157, 157, "Unicellular_organism"], + [159, 159, "Unicellular_organism"]], [], + [[207, 207, "Multicellular_organism"], + [209, 209, "Multicellular_organism"]], + [[217, 218, "Plant_hormone"], [220, 221, "Plant_hormone"], + [223, 223, "Protein"], [241, 241, "Plant_hormone"]], + [[257, 258, "Peptide"], [273, 273, "Peptide"], + [276, 276, "Unicellular_organism"]], + [[300, 300, "Plant_hormone"], [307, 309, "DNA"]], [], + [[354, 355, "Biochemical_process"]], + [[365, 366, "Biochemical_process"]]], + "relations": [[[6, 9, 18, 19, "interacts"]], [], [], [], [], [], [], + [[157, 157, 152, 154, "interacts"], + [159, 159, 152, 154, "interacts"]], [], [], + [[217, 218, 241, 241, "interacts"], + [220, 221, 241, 241, "interacts"], + [223, 223, 241, 241, "interacts"]], [], [], [], [], []] + } + + self.pickle_mult_subsequent_merges_answer = { + "doc_key": + "PMID28911019_abstract", + "dataset": + "pickle", + "sentences": + [[ + "BACKGROUND", "AND", "AIMS", ":", "Selected", "beneficial", + 
"Pseudomonas", "spp", ".", "strains", "have", "the", "ability", + "to", "influence", "root", "architecture", "in", "Arabidopsis", + "thaliana", "by", "inhibiting", "primary", "root", "elongation", + "and", "promoting", "lateral", "root", "and", "root", "hair", + "formation", "." + ], + [ + "A", "crucial", "role", "for", "auxin", "in", "this", "long-term", + "(", "1week", ")", ",", "long-distance", "plant-microbe", + "interaction", "has", "been", "demonstrated", "." + ], + [ + "METHODS", ":", "Arabidopsis", "seedlings", "were", "cultivated", + "in", "vitro", "on", "vertical", "plates", "and", "inoculated", + "with", "pathogenic", "strains", "Pseudomonas", "syringae", "pv", + ".", "maculicola", "(", "Psm", ")", "and", "P.", "syringae", "pv", + ".", "tomato", "DC3000", "(", "Pst", ")", ",", "as", "well", "as", + "Agrobacterium", "tumefaciens", "(", "Atu", ")", "and", + "Escherichia", "coli", "(", "Eco", ")", "." + ], + [ + "Root", "hair", "lengths", "were", "measured", "after", "24", + "and", "48h", "of", "direct", "exposure", "to", "each", + "bacterial", "strain", "." + ], + [ + "Several", "Arabidopsis", "mutants", "with", "impaired", + "responses", "to", "pathogens", ",", "impaired", "ethylene", + "perception", "and", "defects", "in", "the", "exocyst", "vesicle", + "tethering", "complex", "that", "is", "involved", "in", + "secretion", "were", "also", "analysed", "." + ], + [ + "KEY", "RESULTS", ":", "Arabidopsis", "seedling", "roots", + "infected", "with", "Psm", "or", "Pst", "responded", "similarly", + "to", "when", "infected", "with", "plant", "growth-promoting", + "rhizobacteria", ";", "root", "hair", "growth", "was", + "stimulated", "and", "primary", "root", "growth", "was", + "inhibited", "." + ], + [ + "Other", "plant-", "and", "soil-adapted", "bacteria", "induced", + "similar", "root", "hair", "responses", "." 
+ ], + [ + "The", "most", "compromised", "root", "hair", "growth", + "stimulation", "response", "was", "found", "for", "the", + "knockout", "mutants", "exo70A1", "and", "ein2", "." + ], + [ + "The", "single", "immune", "pathways", "dependent", "on", + "salicylic", "acid", ",", "jasmonic", "acid", "and", "PAD4", + "are", "not", "directly", "involved", "in", "root", "hair", + "growth", "stimulation", ";", "however", ",", "in", "the", + "mutual", "cross-talk", "with", "ethylene", ",", "they", + "indirectly", "modify", "the", "extent", "of", "the", + "stimulation", "of", "root", "hair", "growth", "." + ], + [ + "The", "Flg22", "peptide", "does", "not", "initiate", "root", + "hair", "stimulation", "as", "intact", "bacteria", "do", ",", + "but", "pretreatment", "with", "Flg22", "prior", "to", "Psm", + "inoculation", "abolished", "root", "hair", "growth", + "stimulation", "in", "an", "FLS2", "receptor", "kinase-dependent", + "manner", "." + ], + [ + "These", "early", "response", "phenomena", "are", "not", + "associated", "with", "changes", "in", "auxin", "levels", ",", + "as", "monitored", "with", "the", "pDR5::GUS", "auxin", + "reporter", "." + ], + [ + "CONCLUSIONS", ":", "Early", "stimulation", "of", "root", "hair", + "growth", "is", "an", "effect", "of", "an", "unidentified", + "component", "of", "living", "plant", "pathogenic", "bacteria", + "." + ], + [ + "The", "root", "hair", "growth", "response", "is", "triggered", + "in", "the", "range", "of", "hours", "after", "bacterial", + "contact", "with", "roots", "and", "can", "be", "modulated", "by", + "FLS2", "signalling", "." + ], + [ + "Bacterial", "stimulation", "of", "root", "hair", "growth", + "requires", "functional", "ethylene", "signalling", "and", "an", + "efficient", "exocyst-dependent", "secretory", "machinery", "." 
+ ]], + "ner": [[[6, 9, "Unicellular_organism"], + [18, 19, "Multicellular_organism"]], + [[38, 38, "Plant_hormone"]], + [[55, 56, "Multicellular_organism"], + [69, 73, "Unicellular_organism"], + [75, 75, "Unicellular_organism"], + [78, 83, "Unicellular_organism"], + [85, 85, "Unicellular_organism"], + [91, 92, "Unicellular_organism"], + [94, 94, "Unicellular_organism"], + [97, 98, "Unicellular_organism"], + [100, 100, "Unicellular_organism"]], [], + [[121, 122, "Multicellular_organism"], + [130, 131, "Biochemical_process"]], + [[152, 154, "Plant_region"], + [157, 157, "Unicellular_organism"], + [159, 159, "Unicellular_organism"]], [], + [[207, 207, "Multicellular_organism"], + [209, 209, "Multicellular_organism"]], + [[217, 218, "Plant_hormone"], [220, 221, "Plant_hormone"], + [223, 223, "Protein"], [241, 241, "Plant_hormone"]], + [[257, 258, "Peptide"], [273, 273, "Peptide"], + [276, 276, "Unicellular_organism"]], + [[300, 300, "Plant_hormone"], [307, 309, "DNA"]], [], + [[354, 355, "Biochemical_process"]], + [[365, 366, "Biochemical_process"]]], + "relations": [[[6, 9, 18, 19, "interacts"]], [], [], [], [], + [[157, 157, 152, 154, "interacts"], + [159, 159, 152, 154, "interacts"]], [], [], + [[217, 218, 241, 241, "interacts"], + [220, 221, 241, 241, "interacts"], + [223, 223, 241, 241, "interacts"]], [], [], [], [], []] + } + + def test_merge_mult_splits_no_merge_no_multi(self): + + result = ad.AnnotatedDoc.merge_mult_splits(self.no_merge_no_multi) + + self.assertEqual(result, self.no_merge_no_multi_answer) + + def test_merge_mult_splits_no_merge_yes_multi(self): + + result = ad.AnnotatedDoc.merge_mult_splits(self.no_merge_yes_multi) + + self.assertEqual(result, self.no_merge_yes_multi_answer) + + def test_merge_mult_splits_yes_merge_no_multi(self): + + result = ad.AnnotatedDoc.merge_mult_splits(self.yes_merge_no_multi) + + self.assertEqual(result, self.yes_merge_no_multi_answer) + + def test_merge_mult_splits_yes_merge_yes_multi_no_overlap(self): + + result = 
ad.AnnotatedDoc.merge_mult_splits( + self.yes_merge_yes_multi_no_overlap) + + self.assertEqual(result, self.yes_merge_yes_multi_no_overlap_answer) + + def test_merge_mult_splits_yes_merge_yes_multi_yes_overlap(self): + + self.assertRaises(AssertionError, ad.AnnotatedDoc.merge_mult_splits, + self.yes_merge_yes_multi_yes_overlap) + + def test_merge_mult_splits_only_one_merge_pair(self): + + result = ad.AnnotatedDoc.merge_mult_splits(self.only_one_merge_pair) + + self.assertEqual(result, self.only_one_merge_pair_answer) + + def test_quality_check_bioinfer_single(self): + + result = ad.AnnotatedDoc.quality_check_sent_splits( + self.bioinfer_single) + + self.assertEqual(result, self.bioinfer_single_answer) + + def test_quality_check_pickle_mult_no_overlaps_or_merges(self): + + result = ad.AnnotatedDoc.quality_check_sent_splits( + self.pickle_mult_no_overlaps_or_merges) + + self.assertEqual(result, self.pickle_mult_no_overlaps_or_merges_answer) + + def test_quality_check_pickle_mult_subsequent_merges(self): + + result = ad.AnnotatedDoc.quality_check_sent_splits( + self.pickle_mult_subsequent_merges) + + self.assertEqual(result, self.pickle_mult_subsequent_merges_answer) + + class TestDropCounters(unittest.TestCase): """ Tests the functionality of the entity and relation counters in the - AnnotatedDoc class.. + AnnotatedDoc class. """ def setUp(self): @@ -516,31 +1200,33 @@ def setUp(self): self.tmpdir = "tmp" os.makedirs(self.tmpdir, exist_ok=True) - simple_txt = ("Seattle is a rainy city. Jenny Durkan is the city's mayor. " - "She was elected in 2017.") + simple_txt = ( + "Seattle is a rainy city. Jenny Durkan is the city's mayor. 
" + "She was elected in 2017.") self.simple_txt = f'{self.tmpdir}/mysimplefile.txt' with open(self.simple_txt, 'w') as f: f.write(simple_txt) simple_ann = ("T1\tCity 0 7;13 23\tSeattle\n" - "T2\tPerson 25 37\tJenny Durkan\n" - "T3\tCity 41 51\tthe city's\n" - "T4\tPerson 59 62\tShe\n" - "T5\tPersonnel.Election 67 74\telected\n" - "T6\tYear 78 82\t2017\n" - "T7\tCity 13 23\trainy city\n" - "R1\tIs-A Arg1:T1 Arg2:T7\n" - "R2\tMayor-Of Arg1:T2 Arg2:T3\n" - "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n" - "*\tEQUIV T1 T3\n" - "*\tEQUIV T2 T4\n") + "T2\tPerson 25 37\tJenny Durkan\n" + "T3\tCity 41 51\tthe city's\n" + "T4\tPerson 59 62\tShe\n" + "T5\tPersonnel.Election 67 74\telected\n" + "T6\tYear 78 82\t2017\n" + "T7\tCity 13 23\trainy city\n" + "R1\tIs-A Arg1:T1 Arg2:T7\n" + "R2\tMayor-Of Arg1:T2 Arg2:T3\n" + "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n" + "*\tEQUIV T1 T3\n" + "*\tEQUIV T2 T4\n") self.simple_ann = f'{self.tmpdir}/mysimplefile.ann' with open(self.simple_ann, 'w') as f: f.write(simple_ann) - complex_txt = ("Global target profile of the kinase inhibitor bosutinib " + complex_txt = ( + "Global target profile of the kinase inhibitor bosutinib " "in primary chronic myeloid leukemia cells.\n" "The detailed molecular mechanism of action of second-generation " "BCR-ABL tyrosine kinase inhibitors, including perturbed targets and " @@ -569,51 +1255,52 @@ def setUp(self): with open(self.complex_txt, 'w') as f: f.write(complex_txt) - complex_ann = ("T10\tCHEMICAL 932 941\tdasatinib\n" - "T11\tCHEMICAL 1090 1099\tBosutinib\n" - "T12\tCHEMICAL 46 55\tbosutinib\n" - "T13\tGENE-Y 1116 1119\tKIT\n" - "T14\tGENE-N 1123 1162\tplatelet-derived growth factor receptor\n" - "T15\tGENE-N 1210 1223\tSTE20 kinases\n" - "T16\tGENE-Y 1272 1275\tABL\n" - "T17\tGENE-N 1276 1281\tT315I\n" - "T18\tGENE-N 1415 1421\tkinase\n" - "T19\tGENE-Y 1448 1454\tCAMK2G\n" - "T1\tCHEMICAL 1242 1251\tbosutinib\n" - "T20\tGENE-Y 402 405\tSRC\n" - "T21\tGENE-Y 406 409\tABL\n" - "T22\tGENE-N 592 
598\tkinase\n" - "T23\tGENE-N 634 640\tkinase\n" - "T24\tGENE-Y 163 166\tBCR\n" - "T25\tGENE-N 746 783\ttyrosine and serine/threonine kinases\n" - "T26\tGENE-Y 167 170\tABL\n" - "T27\tGENE-N 171 186\ttyrosine kinase\n" - "T28\tGENE-N 959 965\tkinase\n" - "T29\tGENE-Y 1057 1060\tSRC\n" - "T2\tCHEMICAL 1392 1401\tbosutinib\n" - "T30\tGENE-Y 1062 1065\tABL\n" - "T31\tGENE-Y 1070 1073\tTEC\n" - "T32\tGENE-N 1081 1088\tkinases\n" - "T33\tGENE-N 29 35\tkinase\n" - "T3\tCHEMICAL 420 429\tbosutinib\n" - "T4\tCHEMICAL 701 710\tbosutinib\n" - "T5\tCHEMICAL 746 754\ttyrosine\n" - "T6\tCHEMICAL 759 765\tserine\n" - "T7\tCHEMICAL 766 775\tthreonine\n" - "T8\tCHEMICAL 843 852\tbosutinib\n" - "T9\tCHEMICAL 917 926\tbosutinib\n" - "R0\tCPR:10 Arg1:T11 Arg2:T13\n" - "R1\tCPR:10 Arg1:T11 Arg2:T14\n" - "R2\tCPR:10 Arg1:T1 Arg2:T16\n" - "R3\tCPR:10 Arg1:T1 Arg2:T17\n" - "R4\tCPR:2 Arg1:T11 Arg2:T15\n" - "R5\tCPR:4 Arg1:T10 Arg2:T28\n" - "R6\tCPR:4 Arg1:T12 Arg2:T33\n" - "R7\tCPR:4 Arg1:T2 Arg2:T18\n" - "R8\tCPR:4 Arg1:T2 Arg2:T19\n" - "R9\tCPR:4 Arg1:T3 Arg2:T20\n" - "R10\tCPR:4 Arg1:T3 Arg2:T21\n" - "R11\tCPR:4 Arg1:T9 Arg2:T28\n") + complex_ann = ( + "T10\tCHEMICAL 932 941\tdasatinib\n" + "T11\tCHEMICAL 1090 1099\tBosutinib\n" + "T12\tCHEMICAL 46 55\tbosutinib\n" + "T13\tGENE-Y 1116 1119\tKIT\n" + "T14\tGENE-N 1123 1162\tplatelet-derived growth factor receptor\n" + "T15\tGENE-N 1210 1223\tSTE20 kinases\n" + "T16\tGENE-Y 1272 1275\tABL\n" + "T17\tGENE-N 1276 1281\tT315I\n" + "T18\tGENE-N 1415 1421\tkinase\n" + "T19\tGENE-Y 1448 1454\tCAMK2G\n" + "T1\tCHEMICAL 1242 1251\tbosutinib\n" + "T20\tGENE-Y 402 405\tSRC\n" + "T21\tGENE-Y 406 409\tABL\n" + "T22\tGENE-N 592 598\tkinase\n" + "T23\tGENE-N 634 640\tkinase\n" + "T24\tGENE-Y 163 166\tBCR\n" + "T25\tGENE-N 746 783\ttyrosine and serine/threonine kinases\n" + "T26\tGENE-Y 167 170\tABL\n" + "T27\tGENE-N 171 186\ttyrosine kinase\n" + "T28\tGENE-N 959 965\tkinase\n" + "T29\tGENE-Y 1057 1060\tSRC\n" + "T2\tCHEMICAL 1392 1401\tbosutinib\n" + 
"T30\tGENE-Y 1062 1065\tABL\n" + "T31\tGENE-Y 1070 1073\tTEC\n" + "T32\tGENE-N 1081 1088\tkinases\n" + "T33\tGENE-N 29 35\tkinase\n" + "T3\tCHEMICAL 420 429\tbosutinib\n" + "T4\tCHEMICAL 701 710\tbosutinib\n" + "T5\tCHEMICAL 746 754\ttyrosine\n" + "T6\tCHEMICAL 759 765\tserine\n" + "T7\tCHEMICAL 766 775\tthreonine\n" + "T8\tCHEMICAL 843 852\tbosutinib\n" + "T9\tCHEMICAL 917 926\tbosutinib\n" + "R0\tCPR:10 Arg1:T11 Arg2:T13\n" + "R1\tCPR:10 Arg1:T11 Arg2:T14\n" + "R2\tCPR:10 Arg1:T1 Arg2:T16\n" + "R3\tCPR:10 Arg1:T1 Arg2:T17\n" + "R4\tCPR:2 Arg1:T11 Arg2:T15\n" + "R5\tCPR:4 Arg1:T10 Arg2:T28\n" + "R6\tCPR:4 Arg1:T12 Arg2:T33\n" + "R7\tCPR:4 Arg1:T2 Arg2:T18\n" + "R8\tCPR:4 Arg1:T2 Arg2:T19\n" + "R9\tCPR:4 Arg1:T3 Arg2:T20\n" + "R10\tCPR:4 Arg1:T3 Arg2:T21\n" + "R11\tCPR:4 Arg1:T9 Arg2:T28\n") self.complex_ann = f'{self.tmpdir}/mycomplexfile.ann' with open(self.complex_ann, 'w') as f: diff --git a/scripts/new-dataset/annotated_doc.py b/scripts/new-dataset/annotated_doc.py index 221e675..28a8a1c 100644 --- a/scripts/new-dataset/annotated_doc.py +++ b/scripts/new-dataset/annotated_doc.py @@ -194,6 +194,8 @@ def format_dygiepp(self): if len(self.events) > 0: # Some datasets don't have events res["events"] = events + res = AnnotatedDoc.quality_check_sent_splits(res) + return res def char_to_token(self): @@ -268,6 +270,190 @@ def char_to_token(self): f'{self.dropped_ents} of {self.total_original_ents} entities ' 'were dropped due to tokenization mismatches.') + @staticmethod + def quality_check_sent_splits(doc_dict): + """ + Function to detect and correct incorrect sentence splits in a dygiepp- + formatted doc dictionary. + + This function relies on the assumption that a cross-sentence entity or + relation in a dygiepp-formatted doc is a result of an incorrect sentence + split on the part of the tokenizer, rather than intentional. 
If a + cross-sentence entity or relation is found, all sentences between the + sentences containing the two joined entities or entity parts will be + combined into one. + + Example: BioInfer.d70 is one sentence only, with two relations. However, + the conversion to jsonl results in the following doc dictionary: + + {"doc_key": "BioInfer.d70", + "dataset": "bioinfer", + "sentences": [["Aprotinin", "inhibited", "platelet", "aggregation", "induced", + "by", "thrombin", "(", "0.25", "U.ml-1", ")", "with", "IC50", "200", + "kIU.ml-1", ",", "and", "inhibited", "the", "rise", "of", "cytosolic", + "free", "calcium", "concentration", "in", "platelets", "stimulated", "by", + "thrombin", "(", "0.1", "U.ml-1", ")", "in", "the", "absence", "and", "in", + "the", "presence", "of", "Ca2", "+", "0.5", "mmol", "."], + ["L-1", "(","IC50", "117", "and", "50", "kIU.ml-1", ",", "respectively", + ")", ",", "but", "had", "no", "effect", "on", "the", "amounts", "of", + "actin", "and", "myosin", "heavy", "chain", "associated", "with", + "cytoskeletons", "."]], + "ner": [[[29, 29, "Individual_protein"], [0, 0, "Individual_protein"], + [6, 6, "Individual_protein"]], + [[68, 70, "Individual_protein"], [66, 66, "Individual_protein"]]], + "relations": [[[29, 29, 0, 0, "PPI"], [0, 0, 66, 66, "PPI"]], + [[68, 70, 0, 0, "PPI"]]]} + + parameters: + doc_dict, dict: dygiepp-formatted doc + + returns: + doc_dict_corrected, dict: dict with sentence splits corrected + """ + # Get the sentence start and end indices + sent_idxs = [] + for i, sent in enumerate(doc_dict['sentences']): + if i == 0: + sent_start = 0 + else: + sent_start = sent_idxs[i-1][1] + 1 + sent_end = sent_start + len(sent) - 1 + sent_idxs.append((sent_start, sent_end)) + + # For each entity and relation, check if it crosses sentence boundaries + sents_to_join = [] + for i in range(len(doc_dict['sentences'])): + + for ent in doc_dict['ner'][i]: + e_start = ent[0] + e_end = ent[1] + ent_sent_mems = [] + for i, sent in enumerate(sent_idxs): 
+ if sent[0] <= e_start <= sent[1]: + ent_sent_mems.append(i) + if sent[0] <= e_end <= sent[1]: + ent_sent_mems.append(i) + if ent_sent_mems[0] != ent_sent_mems[1]: + ent_sent_mems = tuple(sorted(ent_sent_mems)) + sents_to_join.append(ent_sent_mems) + + for rel in doc_dict['relations'][i]: + e1_start = rel[0] + e2_start = rel[2] + rel_sent_mems = [] + for i, sent in enumerate(sent_idxs): + if sent[0] <= e1_start <= sent[1]: + rel_sent_mems.append(i) + if sent[0] <= e2_start <= sent[1]: + rel_sent_mems.append(i) + if rel_sent_mems[0] != rel_sent_mems[1]: + rel_sent_mems = tuple(sorted(rel_sent_mems)) + sents_to_join.append(rel_sent_mems) + + sents_to_join = list(set([tuple(pair) for pair in sents_to_join])) + + + # Join sentences that need it + if len(sents_to_join) == 0: + doc_dict_corrected = doc_dict + return doc_dict_corrected + else: + doc_dict_corrected = {'doc_key': doc_dict['doc_key'], 'dataset': doc_dict['dataset']} + for key in ['sentences', 'ner', 'relations']: + + # If there are multiples, we need to do some extra processing + if len(sents_to_join) > 1: + + # Merge continuous joins + sents_to_join = AnnotatedDoc.merge_mult_splits(sents_to_join) + + joined = [] + for i, pair in enumerate(sents_to_join): + # Add all sentences before the first to join + first_idx = min(pair) + if i == 0: + add_cand = doc_dict[key][:first_idx] + if len(add_cand) > 0: + joined.extend(add_cand) + + # Join the group and add + last_idx = max(pair) + sent_list = doc_dict[key][first_idx:last_idx+1] + sents_merged = [tok for sent in sent_list for tok in sent] + sents_merged = [sents_merged] + joined.extend(sents_merged) + + # If it's the last merge, add the rest + if (not i == len(doc_dict[key]) - 1) and (i == len(sents_to_join) - 1): + add_cand = doc_dict[key][last_idx + 1:] + if len(add_cand) > 0: + joined.extend(add_cand) + + # If it's not and there's a gap before next merge, add those + elif (last_idx + 1 != sents_to_join[i+1][0]): + add_cand = doc_dict[key][last_idx + 1: 
sents_to_join[i+1][0]] + if len(add_cand) > 0: + joined.extend(add_cand) + + # Add to new doc + doc_dict_corrected[key] = joined + + if len(sents_to_join) >= 1: + print(f'{len(sents_to_join)} sentence joins were performed ' + f'to fix erroneous sentence splits in doc {doc_dict["doc_key"]}') + + return doc_dict_corrected + + @staticmethod + def merge_mult_splits(sents_to_join): + """ + Given a list of sentence index pairs, determine if any represent multi- + joins (a sentence that was split into multiple fragments), and get the + first and last indices of the continuous split to join. + + parameters: + sents_to_join, list of tuples: pairs of sentence indices + + returns: + final_pairings, list of tuples: first and last indices of + continuous splits + """ + # First sort by the first index + srtd = sorted(sents_to_join, key=lambda x: x[0]) + + # Do a common-sense check that the end indices don't overlap + end_overlaps = [True if srtd[i][1] > srtd[i+1][0] else False + for i in range(len(srtd) - 1)] + assert not any(end_overlaps), ('One or more pairs of sentences to join ' + 'overlaps another') + + # Then get the indices where continuous joins stop + break_idxs = [] + for i in range(len(srtd)-1): + if srtd[i][1] != srtd[i+1][0]: + break_idxs.append(i) + break_idxs = [-1] + break_idxs + + # If the only break is at 0, we can just return the list + if break_idxs == [-1] and len(sents_to_join) == 1: + final_pairings = srtd + return final_pairings + else: + final_pairings = [] + + # Use break indices to get the start and end indices of continuous joins + for i in range(len(break_idxs)): + if i == len(break_idxs) - 1: + cont_join = srtd[break_idxs[i]+1:] + cont_join = (cont_join[0][0], cont_join[-1][1]) + else: + cont_join = srtd[break_idxs[i]+1: break_idxs[i+1]+1] + cont_join = (cont_join[0][0], cont_join[-1][1]) + + final_pairings.append(cont_join) + + return final_pairings + class Ent: def __init__(self, line):