diff --git a/dygie/tests/data/annotated_doc_test.py b/dygie/tests/data/annotated_doc_test.py
index 9897c19..0809ffc 100644
--- a/dygie/tests/data/annotated_doc_test.py
+++ b/dygie/tests/data/annotated_doc_test.py
@@ -16,6 +16,8 @@
 import annotated_doc as ad
 import spacy
 
+verboseprint = print  # alias for built-in print; NOTE(review): presumably the verbose hook the code under test expects -- confirm
+
 
 class TestEnt(unittest.TestCase):
     def setUp(self):
@@ -505,10 +507,692 @@ def test_format_dygiepp(self):
         res = annotated_doc.format_dygiepp()
         self.assertEqual(res, self.dygiepp_dict)
 
+
+class TestQualityCheck(unittest.TestCase):
+    """
+    Tests quality_check_sent_splits and its helper, merge_mult_sent_splits.
+    """
+    def setUp(self):
+
+        self.maxDiff = None
+
+        # Helper input/output
+        self.no_merge_no_multi = [(1, 2), (5, 6)]
+        self.no_merge_no_multi_answer = [(1, 2), (5, 6)]
+        self.no_merge_yes_multi = [(1, 4), (5, 6)]
+        self.no_merge_yes_multi_answer = [(1, 4), (5, 6)]
+        self.yes_merge_no_multi = [(2, 3), (3, 4), (4, 5), (6, 7), (8, 9),
+                                   (9, 10)]
+        self.yes_merge_no_multi_answer = [(2, 5), (6, 7), (8, 10)]
+        self.yes_merge_yes_multi_no_overlap = [(2, 5), (6, 7), (8, 9), (9, 10)]
+        self.yes_merge_yes_multi_no_overlap_answer = [(2, 5), (6, 7), (8, 10)]
+        self.yes_merge_yes_multi_yes_overlap = [(2, 9), (8, 9), (9, 10)]
+        self.only_one_merge_pair = [(5,6), (6,7)]
+        self.only_one_merge_pair_answer = [(5,7)]
+
+        # Main func input/output
+        # These are real instances, so they may not cover every possible case;
+        # I've tried to account for the remaining cases in the helper inputs above.
+        self.bioinfer_single = {
+            "doc_key":
+            "BioInfer.d70.s0",
+            "dataset":
+            "bioinfer_ppi",
+            "sentences":
+            [[
+                "Aprotinin", "inhibited", "platelet", "aggregation", "induced",
+                "by", "thrombin", "(", "0.25", "U.ml-1", ")", "with", "IC50",
+                "200", "kIU.ml-1", ",", "and", "inhibited", "the", "rise",
+                "of", "cytosolic", "free", "calcium", "concentration", "in",
+                "platelets", "stimulated", "by", "thrombin", "(", "0.1",
+                "U.ml-1", ")", "in", "the", "absence", "and", "in", "the",
+                "presence", "of", "Ca2", "+", "0.5", "mmol", "."
+            ],
+             [
+                 "L-1", "(", "IC50", "117", "and", "50", "kIU.ml-1", ",",
+                 "respectively", ")", ",", "but", "had", "no", "effect", "on",
+                 "the", "amounts", "of", "actin", "and", "myosin", "heavy",
+                 "chain", "associated", "with", "cytoskeletons", "."
+             ]],
+            "ner": [[[29, 29, "Individual_protein"],
+                     [0, 0, "Individual_protein"],
+                     [6, 6, "Individual_protein"]],
+                    [[68, 70, "Individual_protein"],
+                     [66, 66, "Individual_protein"]]],
+            "relations": [[[29, 29, 0, 0, "PPI"], [0, 0, 66, 66, "PPI"]],
+                          [[68, 70, 0, 0, "PPI"]]]
+        }
+        self.bioinfer_single_answer = {
+            "doc_key":
+            "BioInfer.d70.s0",
+            "dataset":
+            "bioinfer_ppi",
+            "sentences": [[
+                "Aprotinin", "inhibited", "platelet", "aggregation", "induced",
+                "by", "thrombin", "(", "0.25", "U.ml-1", ")", "with", "IC50",
+                "200", "kIU.ml-1", ",", "and", "inhibited", "the", "rise",
+                "of", "cytosolic", "free", "calcium", "concentration", "in",
+                "platelets", "stimulated", "by", "thrombin", "(", "0.1",
+                "U.ml-1", ")", "in", "the", "absence", "and", "in", "the",
+                "presence", "of", "Ca2", "+", "0.5", "mmol", ".", "L-1", "(",
+                "IC50", "117", "and", "50", "kIU.ml-1", ",", "respectively",
+                ")", ",", "but", "had", "no", "effect", "on", "the", "amounts",
+                "of", "actin", "and", "myosin", "heavy", "chain", "associated",
+                "with", "cytoskeletons", "."
+            ]],
+            "ner": [[[29, 29, "Individual_protein"],
+                     [0, 0, "Individual_protein"],
+                     [6, 6, "Individual_protein"],
+                     [68, 70, "Individual_protein"],
+                     [66, 66, "Individual_protein"]]],
+            "relations": [[[29, 29, 0, 0, "PPI"], [0, 0, 66, 66, "PPI"],
+                           [68, 70, 0, 0, "PPI"]]]
+        }
+
+        self.pickle_mult_no_overlaps_or_merges = {
+            "doc_key":
+            "PMID12825696_abstract",
+            "dataset":
+            "pickle",
+            "sentences":
+            [[
+                "Ca2", "+", "and", "calmodulin", "(", "CaM", ")", ",", "a",
+                "key", "Ca2", "+", "sensor", "in", "all", "eukaryotes", ",",
+                "have", "been", "implicated", "in", "defense", "responses",
+                "in", "plants", "."
+            ],
+             [
+                 "To", "elucidate", "the", "role", "of", "Ca2", "+", "and",
+                 "CaM", "in", "defense", "signaling", ",", "we", "used",
+                 "35S-labeled", "CaM", "to", "screen", "expression",
+                 "libraries", "prepared", "from", "tissues", "that", "were",
+                 "either", "treated", "with", "an", "elicitor", "derived",
+                 "from", "Phytophthora", "megasperma", "or", "infected",
+                 "with", "Pseudomonas", "syringae", "pv", "."
+             ], ["tabaci", "."],
+             [
+                 "Nineteen", "cDNAs", "that", "encode", "the", "same",
+                 "protein", ",", "pathogen-induced", "CaM-binding", "protein",
+                 "(", "PICBP", ")", ",", "were", "isolated", "."
+             ],
+             [
+                 "The", "PICBP", "fusion", "proteins", "bound", "35S-CaM", ",",
+                 "horseradish", "peroxidase-labeled", "CaM", "and",
+                 "CaM-Sepharose", "in", "the", "presence", "of", "Ca2", "+",
+                 "whereas", "EGTA", ",", "a", "Ca2", "+", "chelator", ",",
+                 "abolished", "binding", ",", "confirming", "that", "PICBP",
+                 "binds", "CaM", "in", "a", "Ca2", "+", "-dependent", "manner",
+                 "."
+             ],
+             [
+                 "Using", "a", "series", "of", "bacterially", "expressed",
+                 "truncated", "versions", "of", "PICBP", ",", "four",
+                 "CaM-binding", "domains", ",", "with", "a", "potential",
+                 "CaM-binding", "consensus", "sequence", "of",
+                 "WSNLKKVILLKRFVKSL", ",", "were", "identified", "."
+             ],
+             [
+                 "The", "deduced", "PICBP", "protein", "sequence", "is",
+                 "rich", "in", "leucine", "residues", "and", "contains",
+                 "three", "classes", "of", "repeats", "."
+             ],
+             [
+                 "The", "PICBP", "gene", "is", "differentially", "expressed",
+                 "in", "tissues", "with", "the", "highest", "expression", "in",
+                 "stem", "."
+             ],
+             [
+                 "The", "expression", "of", "PICBP", "in", "Arabidopsis",
+                 "was", "induced", "in", "response", "to", "avirulent",
+                 "Pseudomonas", "syringae", "pv", "."
+             ], ["tomato", "carrying", "avrRpm1", "."],
+             [
+                 "Furthermore", ",", "PICBP", "is", "constitutively",
+                 "expressed", "in", "the", "Arabidopsis", "accelerated",
+                 "cell", "death2", "-", "2", "mutant", "."
+             ],
+             [
+                 "The", "expression", "of", "PICBP", "in", "bean", "leaves",
+                 "was", "also", "induced", "after", "inoculation", "with",
+                 "avirulent", "and", "non-pathogenic", "bacterial", "strains",
+                 "."
+             ],
+             [
+                 "In", "addition", ",", "the", "hrp1", "mutant", "of",
+                 "Pseudomonas", "syringae", "pv", "."
+             ],
+             [
+                 "tabaci", "and", "inducers", "of", "plant", "defense", "such",
+                 "as", "salicylic", "acid", ",", "hydrogen", "peroxide", "and",
+                 "a", "fungal", "elicitor", "induced", "PICBP", "expression",
+                 "in", "bean", "."
+             ],
+             [
+                 "Our", "data", "suggest", "a", "role", "for", "PICBP", "in",
+                 "Ca2", "+", "-mediated", "defense", "signaling", "and",
+                 "cell-death", "."
+             ],
+             [
+                 "Furthermore", ",", "PICBP", "is", "the", "first",
+                 "identified", "CBP", "in", "eukaryotes", "with", "four",
+                 "Ca2", "+", "-dependent", "CaM-binding", "domains", "."
+             ]],
+            "ner": [[[3, 3, "Protein"], [5, 5, "Protein"], [0, 1, "Element"],
+                     [10, 12, "Protein"]],
+                    [[34, 34, "Protein"], [59, 60, "Multicellular_organism"],
+                     [31, 32, "Element"], [41, 42, "Protein"],
+                     [64, 68, "Unicellular_organism"]], [],
+                    [[82, 82, "Protein"], [78, 80, "Protein"]],
+                    [[107, 107, "Organic_compound_other"], [93, 93, "Protein"],
+                     [95, 97, "Protein"], [99, 99, "Protein"],
+                     [104, 105, "Element"], [119, 119, "Protein"],
+                     [121, 121, "Protein"], [89, 91, "Protein"],
+                     [110, 112, "Organic_compound_other"]],
+                    [[138, 138, "Protein"], [141, 142, "Peptide"],
+                     [151, 151, "Peptide"], [147, 149, "Peptide"]],
+                    [[164, 165, "Amino_acid_monomer"], [158, 159, "Protein"]],
+                    [[174, 175, "DNA"]],
+                    [[193, 193, "Multicellular_organism"], [191, 191, "DNA"],
+                     [199, 204, "Unicellular_organism"]], [[206, 206, "DNA"]],
+                    [[210, 210, "Protein"],
+                     [216, 222, "Multicellular_organism"]],
+                    [[229, 230, "Plant_region"], [227, 227, "DNA"]],
+                    [[247, 248, "Unicellular_organism"],
+                     [250, 254, "Unicellular_organism"]],
+                    [[265, 266, "Inorganic_compound_other"],
+                     [262, 263, "Plant_hormone"],
+                     [272, 273, "Biochemical_process"],
+                     [275, 275, "Multicellular_organism"]],
+                    [[285, 291, "Biochemical_process"], [283, 283, "DNA"]],
+                    [[295, 295, "Protein"], [305, 309, "Peptide"],
+                     [300, 300, "Protein"]]],
+            "relations": [[], [], [], [],
+                          [[119, 119, 121, 121, "interacts"],
+                           [89, 91, 93, 93, "interacts"],
+                           [89, 91, 95, 97, "interacts"],
+                           [89, 91, 99, 99, "interacts"],
+                           [89, 91, 104, 105, "interacts"],
+                           [107, 107, 89, 91, "inhibits"],
+                           [107, 107, 104, 105, "inhibits"]], [],
+                          [[164, 165, 158, 159, "is-in"]], [],
+                          [[191, 191, 193, 193, "is-in"],
+                           [199, 204, 191, 191, "activates"]],
+                          [[206, 206, 199, 204, "is-in"]],
+                          [[210, 210, 216, 222, "is-in"]],
+                          [[227, 227, 229, 230, "is-in"]],
+                          [[247, 248, 272, 273, "activates"]],
+                          [[262, 263, 272, 273, "activates"],
+                           [265, 266, 272, 273, "activates"],
+                           [272, 273, 275, 275, "is-in"]],
+                          [[283, 283, 285, 291, "is-in"]],
+                          [[305, 309, 295, 295, "is-in"]]]
+        }
+
+        self.pickle_mult_no_overlaps_or_merges_answer = {
+            "doc_key":
+            "PMID12825696_abstract",
+            "dataset":
+            "pickle",
+            "sentences":
+            [[
+                "Ca2", "+", "and", "calmodulin", "(", "CaM", ")", ",", "a",
+                "key", "Ca2", "+", "sensor", "in", "all", "eukaryotes", ",",
+                "have", "been", "implicated", "in", "defense", "responses",
+                "in", "plants", "."
+            ],
+             [
+                 "To", "elucidate", "the", "role", "of", "Ca2", "+", "and",
+                 "CaM", "in", "defense", "signaling", ",", "we", "used",
+                 "35S-labeled", "CaM", "to", "screen", "expression",
+                 "libraries", "prepared", "from", "tissues", "that", "were",
+                 "either", "treated", "with", "an", "elicitor", "derived",
+                 "from", "Phytophthora", "megasperma", "or", "infected",
+                 "with", "Pseudomonas", "syringae", "pv", ".", "tabaci", "."
+             ],
+             [
+                 "Nineteen", "cDNAs", "that", "encode", "the", "same",
+                 "protein", ",", "pathogen-induced", "CaM-binding", "protein",
+                 "(", "PICBP", ")", ",", "were", "isolated", "."
+             ],
+             [
+                 "The", "PICBP", "fusion", "proteins", "bound", "35S-CaM", ",",
+                 "horseradish", "peroxidase-labeled", "CaM", "and",
+                 "CaM-Sepharose", "in", "the", "presence", "of", "Ca2", "+",
+                 "whereas", "EGTA", ",", "a", "Ca2", "+", "chelator", ",",
+                 "abolished", "binding", ",", "confirming", "that", "PICBP",
+                 "binds", "CaM", "in", "a", "Ca2", "+", "-dependent", "manner",
+                 "."
+             ],
+             [
+                 "Using", "a", "series", "of", "bacterially", "expressed",
+                 "truncated", "versions", "of", "PICBP", ",", "four",
+                 "CaM-binding", "domains", ",", "with", "a", "potential",
+                 "CaM-binding", "consensus", "sequence", "of",
+                 "WSNLKKVILLKRFVKSL", ",", "were", "identified", "."
+             ],
+             [
+                 "The", "deduced", "PICBP", "protein", "sequence", "is",
+                 "rich", "in", "leucine", "residues", "and", "contains",
+                 "three", "classes", "of", "repeats", "."
+             ],
+             [
+                 "The", "PICBP", "gene", "is", "differentially", "expressed",
+                 "in", "tissues", "with", "the", "highest", "expression", "in",
+                 "stem", "."
+             ],
+             [
+                 "The", "expression", "of", "PICBP", "in", "Arabidopsis",
+                 "was", "induced", "in", "response", "to", "avirulent",
+                 "Pseudomonas", "syringae", "pv", ".", "tomato", "carrying",
+                 "avrRpm1", "."
+             ],
+             [
+                 "Furthermore", ",", "PICBP", "is", "constitutively",
+                 "expressed", "in", "the", "Arabidopsis", "accelerated",
+                 "cell", "death2", "-", "2", "mutant", "."
+             ],
+             [
+                 "The", "expression", "of", "PICBP", "in", "bean", "leaves",
+                 "was", "also", "induced", "after", "inoculation", "with",
+                 "avirulent", "and", "non-pathogenic", "bacterial", "strains",
+                 "."
+             ],
+             [
+                 "In", "addition", ",", "the", "hrp1", "mutant", "of",
+                 "Pseudomonas", "syringae", "pv", ".", "tabaci", "and",
+                 "inducers", "of", "plant", "defense", "such", "as",
+                 "salicylic", "acid", ",", "hydrogen", "peroxide", "and", "a",
+                 "fungal", "elicitor", "induced", "PICBP", "expression", "in",
+                 "bean", "."
+             ],
+             [
+                 "Our", "data", "suggest", "a", "role", "for", "PICBP", "in",
+                 "Ca2", "+", "-mediated", "defense", "signaling", "and",
+                 "cell-death", "."
+             ],
+             [
+                 "Furthermore", ",", "PICBP", "is", "the", "first",
+                 "identified", "CBP", "in", "eukaryotes", "with", "four",
+                 "Ca2", "+", "-dependent", "CaM-binding", "domains", "."
+             ]],
+            "ner": [[[3, 3, "Protein"], [5, 5, "Protein"], [0, 1, "Element"],
+                     [10, 12, "Protein"]],
+                    [[34, 34, "Protein"], [59, 60, "Multicellular_organism"],
+                     [31, 32, "Element"], [41, 42, "Protein"],
+                     [64, 68, "Unicellular_organism"]],
+                    [[82, 82, "Protein"], [78, 80, "Protein"]],
+                    [[107, 107, "Organic_compound_other"], [93, 93, "Protein"],
+                     [95, 97, "Protein"], [99, 99, "Protein"],
+                     [104, 105, "Element"], [119, 119, "Protein"],
+                     [121, 121, "Protein"], [89, 91, "Protein"],
+                     [110, 112, "Organic_compound_other"]],
+                    [[138, 138, "Protein"], [141, 142, "Peptide"],
+                     [151, 151, "Peptide"], [147, 149, "Peptide"]],
+                    [[164, 165, "Amino_acid_monomer"], [158, 159, "Protein"]],
+                    [[174, 175, "DNA"]],
+                    [[193, 193, "Multicellular_organism"], [191, 191, "DNA"],
+                     [199, 204, "Unicellular_organism"], [206, 206, "DNA"]],
+                    [[210, 210, "Protein"],
+                     [216, 222, "Multicellular_organism"]],
+                    [[229, 230, "Plant_region"], [227, 227, "DNA"]],
+                    [[247, 248, "Unicellular_organism"],
+                     [250, 254, "Unicellular_organism"],
+                     [265, 266, "Inorganic_compound_other"],
+                     [262, 263, "Plant_hormone"],
+                     [272, 273, "Biochemical_process"],
+                     [275, 275, "Multicellular_organism"]],
+                    [[285, 291, "Biochemical_process"], [283, 283, "DNA"]],
+                    [[295, 295, "Protein"], [305, 309, "Peptide"],
+                     [300, 300, "Protein"]]],
+            "relations": [[], [], [],
+                          [[119, 119, 121, 121, "interacts"],
+                           [89, 91, 93, 93, "interacts"],
+                           [89, 91, 95, 97, "interacts"],
+                           [89, 91, 99, 99, "interacts"],
+                           [89, 91, 104, 105, "interacts"],
+                           [107, 107, 89, 91, "inhibits"],
+                           [107, 107, 104, 105, "inhibits"]], [],
+                          [[164, 165, 158, 159, "is-in"]], [],
+                          [[191, 191, 193, 193, "is-in"],
+                           [199, 204, 191, 191, "activates"],
+                           [206, 206, 199, 204, "is-in"]],
+                          [[210, 210, 216, 222, "is-in"]],
+                          [[227, 227, 229, 230, "is-in"]],
+                          [[247, 248, 272, 273, "activates"],
+                           [262, 263, 272, 273, "activates"],
+                           [265, 266, 272, 273, "activates"],
+                           [272, 273, 275, 275, "is-in"]],
+                          [[283, 283, 285, 291, "is-in"]],
+                          [[305, 309, 295, 295, "is-in"]]]
+        }
+
+        self.pickle_mult_subsequent_merges = {
+            "doc_key":
+            "PMID28911019_abstract",
+            "dataset":
+            "pickle",
+            "sentences":
+            [[
+                "BACKGROUND", "AND", "AIMS", ":", "Selected", "beneficial",
+                "Pseudomonas", "spp", ".", "strains", "have", "the", "ability",
+                "to", "influence", "root", "architecture", "in", "Arabidopsis",
+                "thaliana", "by", "inhibiting", "primary", "root", "elongation",
+                "and", "promoting", "lateral", "root", "and", "root", "hair",
+                "formation", "."
+            ],
+             [
+                 "A", "crucial", "role", "for", "auxin", "in", "this", "long-term",
+                 "(", "1week", ")", ",", "long-distance", "plant-microbe",
+                 "interaction", "has", "been", "demonstrated", "."
+             ],
+             [
+                 "METHODS", ":", "Arabidopsis", "seedlings", "were", "cultivated",
+                 "in", "vitro", "on", "vertical", "plates", "and", "inoculated",
+                 "with", "pathogenic", "strains", "Pseudomonas", "syringae", "pv",
+                 "."
+             ],
+             ["maculicola", "(", "Psm", ")", "and", "P.", "syringae", "pv", "."],
+             [
+                 "tomato", "DC3000", "(", "Pst", ")", ",", "as", "well", "as",
+                 "Agrobacterium", "tumefaciens", "(", "Atu", ")", "and",
+                 "Escherichia", "coli", "(", "Eco", ")", "."
+             ],
+             [
+                 "Root", "hair", "lengths", "were", "measured", "after", "24",
+                 "and", "48h", "of", "direct", "exposure", "to", "each",
+                 "bacterial", "strain", "."
+             ],
+             [
+                 "Several", "Arabidopsis", "mutants", "with", "impaired",
+                 "responses", "to", "pathogens", ",", "impaired", "ethylene",
+                 "perception", "and", "defects", "in", "the", "exocyst", "vesicle",
+                 "tethering", "complex", "that", "is", "involved", "in",
+                 "secretion", "were", "also", "analysed", "."
+             ],
+             [
+                 "KEY", "RESULTS", ":", "Arabidopsis", "seedling", "roots",
+                 "infected", "with", "Psm", "or", "Pst", "responded", "similarly",
+                 "to", "when", "infected", "with", "plant", "growth-promoting",
+                 "rhizobacteria", ";", "root", "hair", "growth", "was",
+                 "stimulated", "and", "primary", "root", "growth", "was",
+                 "inhibited", "."
+             ],
+             [
+                 "Other", "plant-", "and", "soil-adapted", "bacteria", "induced",
+                 "similar", "root", "hair", "responses", "."
+             ],
+             [
+                 "The", "most", "compromised", "root", "hair", "growth",
+                 "stimulation", "response", "was", "found", "for", "the",
+                 "knockout", "mutants", "exo70A1", "and", "ein2", "."
+             ],
+             [
+                 "The", "single", "immune", "pathways", "dependent", "on",
+                 "salicylic", "acid", ",", "jasmonic", "acid", "and", "PAD4",
+                 "are", "not", "directly", "involved", "in", "root", "hair",
+                 "growth", "stimulation", ";", "however", ",", "in", "the",
+                 "mutual", "cross-talk", "with", "ethylene", ",", "they",
+                 "indirectly", "modify", "the", "extent", "of", "the",
+                 "stimulation", "of", "root", "hair", "growth", "."
+             ],
+             [
+                 "The", "Flg22", "peptide", "does", "not", "initiate", "root",
+                 "hair", "stimulation", "as", "intact", "bacteria", "do", ",",
+                 "but", "pretreatment", "with", "Flg22", "prior", "to", "Psm",
+                 "inoculation", "abolished", "root", "hair", "growth",
+                 "stimulation", "in", "an", "FLS2", "receptor", "kinase-dependent",
+                 "manner", "."
+             ],
+             [
+                 "These", "early", "response", "phenomena", "are", "not",
+                 "associated", "with", "changes", "in", "auxin", "levels", ",",
+                 "as", "monitored", "with", "the", "pDR5::GUS", "auxin",
+                 "reporter", "."
+             ],
+             [
+                 "CONCLUSIONS", ":", "Early", "stimulation", "of", "root", "hair",
+                 "growth", "is", "an", "effect", "of", "an", "unidentified",
+                 "component", "of", "living", "plant", "pathogenic", "bacteria",
+                 "."
+             ],
+             [
+                 "The", "root", "hair", "growth", "response", "is", "triggered",
+                 "in", "the", "range", "of", "hours", "after", "bacterial",
+                 "contact", "with", "roots", "and", "can", "be", "modulated", "by",
+                 "FLS2", "signalling", "."
+             ],
+             [
+                 "Bacterial", "stimulation", "of", "root", "hair", "growth",
+                 "requires", "functional", "ethylene", "signalling", "and", "an",
+                 "efficient", "exocyst-dependent", "secretory", "machinery", "."
+             ]],
+            "ner": [[[6, 9, "Unicellular_organism"],
+                     [18, 19, "Multicellular_organism"]],
+                    [[38, 38, "Plant_hormone"]],
+                    [[55, 56, "Multicellular_organism"],
+                     [69, 73, "Unicellular_organism"]],
+                    [[75, 75, "Unicellular_organism"],
+                     [78, 83, "Unicellular_organism"]],
+                    [[85, 85, "Unicellular_organism"],
+                     [91, 92, "Unicellular_organism"],
+                     [94, 94, "Unicellular_organism"],
+                     [97, 98, "Unicellular_organism"],
+                     [100, 100, "Unicellular_organism"]], [],
+                    [[121, 122, "Multicellular_organism"],
+                     [130, 131, "Biochemical_process"]],
+                    [[152, 154, "Plant_region"],
+                     [157, 157, "Unicellular_organism"],
+                     [159, 159, "Unicellular_organism"]], [],
+                    [[207, 207, "Multicellular_organism"],
+                     [209, 209, "Multicellular_organism"]],
+                    [[217, 218, "Plant_hormone"], [220, 221, "Plant_hormone"],
+                     [223, 223, "Protein"], [241, 241, "Plant_hormone"]],
+                    [[257, 258, "Peptide"], [273, 273, "Peptide"],
+                     [276, 276, "Unicellular_organism"]],
+                    [[300, 300, "Plant_hormone"], [307, 309, "DNA"]], [],
+                    [[354, 355, "Biochemical_process"]],
+                    [[365, 366, "Biochemical_process"]]],
+            "relations": [[[6, 9, 18, 19, "interacts"]], [], [], [], [], [], [],
+                          [[157, 157, 152, 154, "interacts"],
+                           [159, 159, 152, 154, "interacts"]], [], [],
+                          [[217, 218, 241, 241, "interacts"],
+                           [220, 221, 241, 241, "interacts"],
+                           [223, 223, 241, 241, "interacts"]], [], [], [], [], []]
+        }
+    
+        self.pickle_mult_subsequent_merges_answer = {
+            "doc_key":
+            "PMID28911019_abstract",
+            "dataset":
+            "pickle",
+            "sentences":
+            [[
+                "BACKGROUND", "AND", "AIMS", ":", "Selected", "beneficial",
+                "Pseudomonas", "spp", ".", "strains", "have", "the", "ability",
+                "to", "influence", "root", "architecture", "in", "Arabidopsis",
+                "thaliana", "by", "inhibiting", "primary", "root", "elongation",
+                "and", "promoting", "lateral", "root", "and", "root", "hair",
+                "formation", "."
+            ],
+             [
+                 "A", "crucial", "role", "for", "auxin", "in", "this", "long-term",
+                 "(", "1week", ")", ",", "long-distance", "plant-microbe",
+                 "interaction", "has", "been", "demonstrated", "."
+             ],
+             [
+                 "METHODS", ":", "Arabidopsis", "seedlings", "were", "cultivated",
+                 "in", "vitro", "on", "vertical", "plates", "and", "inoculated",
+                 "with", "pathogenic", "strains", "Pseudomonas", "syringae", "pv",
+                 ".", "maculicola", "(", "Psm", ")", "and", "P.", "syringae", "pv",
+                 ".", "tomato", "DC3000", "(", "Pst", ")", ",", "as", "well", "as",
+                 "Agrobacterium", "tumefaciens", "(", "Atu", ")", "and",
+                 "Escherichia", "coli", "(", "Eco", ")", "."
+             ],
+             [
+                 "Root", "hair", "lengths", "were", "measured", "after", "24",
+                 "and", "48h", "of", "direct", "exposure", "to", "each",
+                 "bacterial", "strain", "."
+             ],
+             [
+                 "Several", "Arabidopsis", "mutants", "with", "impaired",
+                 "responses", "to", "pathogens", ",", "impaired", "ethylene",
+                 "perception", "and", "defects", "in", "the", "exocyst", "vesicle",
+                 "tethering", "complex", "that", "is", "involved", "in",
+                 "secretion", "were", "also", "analysed", "."
+             ],
+             [
+                 "KEY", "RESULTS", ":", "Arabidopsis", "seedling", "roots",
+                 "infected", "with", "Psm", "or", "Pst", "responded", "similarly",
+                 "to", "when", "infected", "with", "plant", "growth-promoting",
+                 "rhizobacteria", ";", "root", "hair", "growth", "was",
+                 "stimulated", "and", "primary", "root", "growth", "was",
+                 "inhibited", "."
+             ],
+             [
+                 "Other", "plant-", "and", "soil-adapted", "bacteria", "induced",
+                 "similar", "root", "hair", "responses", "."
+             ],
+             [
+                 "The", "most", "compromised", "root", "hair", "growth",
+                 "stimulation", "response", "was", "found", "for", "the",
+                 "knockout", "mutants", "exo70A1", "and", "ein2", "."
+             ],
+             [
+                 "The", "single", "immune", "pathways", "dependent", "on",
+                 "salicylic", "acid", ",", "jasmonic", "acid", "and", "PAD4",
+                 "are", "not", "directly", "involved", "in", "root", "hair",
+                 "growth", "stimulation", ";", "however", ",", "in", "the",
+                 "mutual", "cross-talk", "with", "ethylene", ",", "they",
+                 "indirectly", "modify", "the", "extent", "of", "the",
+                 "stimulation", "of", "root", "hair", "growth", "."
+             ],
+             [
+                 "The", "Flg22", "peptide", "does", "not", "initiate", "root",
+                 "hair", "stimulation", "as", "intact", "bacteria", "do", ",",
+                 "but", "pretreatment", "with", "Flg22", "prior", "to", "Psm",
+                 "inoculation", "abolished", "root", "hair", "growth",
+                 "stimulation", "in", "an", "FLS2", "receptor", "kinase-dependent",
+                 "manner", "."
+             ],
+             [
+                 "These", "early", "response", "phenomena", "are", "not",
+                 "associated", "with", "changes", "in", "auxin", "levels", ",",
+                 "as", "monitored", "with", "the", "pDR5::GUS", "auxin",
+                 "reporter", "."
+             ],
+             [
+                 "CONCLUSIONS", ":", "Early", "stimulation", "of", "root", "hair",
+                 "growth", "is", "an", "effect", "of", "an", "unidentified",
+                 "component", "of", "living", "plant", "pathogenic", "bacteria",
+                 "."
+             ],
+             [
+                 "The", "root", "hair", "growth", "response", "is", "triggered",
+                 "in", "the", "range", "of", "hours", "after", "bacterial",
+                 "contact", "with", "roots", "and", "can", "be", "modulated", "by",
+                 "FLS2", "signalling", "."
+             ],
+             [
+                 "Bacterial", "stimulation", "of", "root", "hair", "growth",
+                 "requires", "functional", "ethylene", "signalling", "and", "an",
+                 "efficient", "exocyst-dependent", "secretory", "machinery", "."
+             ]],
+            "ner": [[[6, 9, "Unicellular_organism"],
+                     [18, 19, "Multicellular_organism"]],
+                    [[38, 38, "Plant_hormone"]],
+                    [[55, 56, "Multicellular_organism"],
+                     [69, 73, "Unicellular_organism"],
+                     [75, 75, "Unicellular_organism"],
+                     [78, 83, "Unicellular_organism"],
+                     [85, 85, "Unicellular_organism"],
+                     [91, 92, "Unicellular_organism"],
+                     [94, 94, "Unicellular_organism"],
+                     [97, 98, "Unicellular_organism"],
+                     [100, 100, "Unicellular_organism"]], [],
+                    [[121, 122, "Multicellular_organism"],
+                     [130, 131, "Biochemical_process"]],
+                    [[152, 154, "Plant_region"],
+                     [157, 157, "Unicellular_organism"],
+                     [159, 159, "Unicellular_organism"]], [],
+                    [[207, 207, "Multicellular_organism"],
+                     [209, 209, "Multicellular_organism"]],
+                    [[217, 218, "Plant_hormone"], [220, 221, "Plant_hormone"],
+                     [223, 223, "Protein"], [241, 241, "Plant_hormone"]],
+                    [[257, 258, "Peptide"], [273, 273, "Peptide"],
+                     [276, 276, "Unicellular_organism"]],
+                    [[300, 300, "Plant_hormone"], [307, 309, "DNA"]], [],
+                    [[354, 355, "Biochemical_process"]],
+                    [[365, 366, "Biochemical_process"]]],
+            "relations": [[[6, 9, 18, 19, "interacts"]], [], [], [], [],
+                          [[157, 157, 152, 154, "interacts"],
+                           [159, 159, 152, 154, "interacts"]], [], [],
+                          [[217, 218, 241, 241, "interacts"],
+                           [220, 221, 241, 241, "interacts"],
+                           [223, 223, 241, 241, "interacts"]], [], [], [], [], []]
+        }
+
+    def test_merge_mult_splits_no_merge_no_multi(self):
+        """Pairs that share no sentence index come back unchanged."""
+        merged = ad.AnnotatedDoc.merge_mult_splits(self.no_merge_no_multi)
+
+        self.assertEqual(merged, self.no_merge_no_multi_answer)
+
+    def test_merge_mult_splits_no_merge_yes_multi(self):
+        """A multi-sentence pair with nothing chained to it is kept as is."""
+        merged = ad.AnnotatedDoc.merge_mult_splits(self.no_merge_yes_multi)
+
+        self.assertEqual(merged, self.no_merge_yes_multi_answer)
+
+    def test_merge_mult_splits_yes_merge_no_multi(self):
+        """Each run of chained pairs collapses to one (first, last) span."""
+        merged = ad.AnnotatedDoc.merge_mult_splits(self.yes_merge_no_multi)
+
+        self.assertEqual(merged, self.yes_merge_no_multi_answer)
+
+    def test_merge_mult_splits_yes_merge_yes_multi_no_overlap(self):
+        """A multi-sentence pair stays put while a later chain is merged."""
+        spans = self.yes_merge_yes_multi_no_overlap
+        merged = ad.AnnotatedDoc.merge_mult_splits(spans)
+
+        self.assertEqual(merged, self.yes_merge_yes_multi_no_overlap_answer)
+
+    def test_merge_mult_splits_yes_merge_yes_multi_yes_overlap(self):
+        """Overlapping spans must make the helper raise AssertionError."""
+        self.assertRaises(AssertionError, ad.AnnotatedDoc.merge_mult_splits,
+                          self.yes_merge_yes_multi_yes_overlap)
+
+    def test_merge_mult_splits_only_one_merge_pair(self):
+        """Two chained pairs on their own merge into a single span."""
+        merged = ad.AnnotatedDoc.merge_mult_splits(self.only_one_merge_pair)
+
+        self.assertEqual(merged, self.only_one_merge_pair_answer)
+
+    def test_quality_check_bioinfer_single(self):
+        """Checks quality_check_sent_splits on the bioinfer_single fixture."""
+        doc = self.bioinfer_single
+        fixed = ad.AnnotatedDoc.quality_check_sent_splits(doc)
+
+        self.assertEqual(fixed, self.bioinfer_single_answer)
+
+    def test_quality_check_pickle_mult_no_overlaps_or_merges(self):
+        """Checks the pickle_mult_no_overlaps_or_merges fixture."""
+        doc = self.pickle_mult_no_overlaps_or_merges
+        fixed = ad.AnnotatedDoc.quality_check_sent_splits(doc)
+
+        self.assertEqual(fixed, self.pickle_mult_no_overlaps_or_merges_answer)
+
+    def test_quality_check_pickle_mult_subsequent_merges(self):
+        """Checks the pickle_mult_subsequent_merges fixture."""
+        doc = self.pickle_mult_subsequent_merges
+        fixed = ad.AnnotatedDoc.quality_check_sent_splits(doc)
+
+        self.assertEqual(fixed, self.pickle_mult_subsequent_merges_answer)
+
+
 class TestDropCounters(unittest.TestCase):
     """
     Tests the functionality of the entity and relation counters in the
-    AnnotatedDoc class..
+    AnnotatedDoc class.
     """
     def setUp(self):
 
@@ -516,31 +1200,33 @@ def setUp(self):
         self.tmpdir = "tmp"
         os.makedirs(self.tmpdir, exist_ok=True)
 
-        simple_txt = ("Seattle is a rainy city. Jenny Durkan is the city's mayor. "
-               "She was elected in 2017.")
+        simple_txt = (
+            "Seattle is a rainy city. Jenny Durkan is the city's mayor. "
+            "She was elected in 2017.")
 
         self.simple_txt = f'{self.tmpdir}/mysimplefile.txt'
         with open(self.simple_txt, 'w') as f:
             f.write(simple_txt)
 
         simple_ann = ("T1\tCity 0 7;13 23\tSeattle\n"
-               "T2\tPerson 25 37\tJenny Durkan\n"
-               "T3\tCity 41 51\tthe city's\n"
-               "T4\tPerson 59 62\tShe\n"
-               "T5\tPersonnel.Election 67 74\telected\n"
-               "T6\tYear 78 82\t2017\n"
-               "T7\tCity 13 23\trainy city\n"
-               "R1\tIs-A Arg1:T1 Arg2:T7\n"
-               "R2\tMayor-Of Arg1:T2 Arg2:T3\n"
-               "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n"
-               "*\tEQUIV T1 T3\n"
-               "*\tEQUIV T2 T4\n")
+                      "T2\tPerson 25 37\tJenny Durkan\n"
+                      "T3\tCity 41 51\tthe city's\n"
+                      "T4\tPerson 59 62\tShe\n"
+                      "T5\tPersonnel.Election 67 74\telected\n"
+                      "T6\tYear 78 82\t2017\n"
+                      "T7\tCity 13 23\trainy city\n"
+                      "R1\tIs-A Arg1:T1 Arg2:T7\n"
+                      "R2\tMayor-Of Arg1:T2 Arg2:T3\n"
+                      "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n"
+                      "*\tEQUIV T1 T3\n"
+                      "*\tEQUIV T2 T4\n")
 
         self.simple_ann = f'{self.tmpdir}/mysimplefile.ann'
         with open(self.simple_ann, 'w') as f:
             f.write(simple_ann)
 
-        complex_txt = ("Global target profile of the kinase inhibitor bosutinib "
+        complex_txt = (
+            "Global target profile of the kinase inhibitor bosutinib "
             "in primary chronic myeloid leukemia cells.\n"
             "The detailed molecular mechanism of action of second-generation "
             "BCR-ABL tyrosine kinase inhibitors, including perturbed targets and "
@@ -569,51 +1255,52 @@ def setUp(self):
         with open(self.complex_txt, 'w') as f:
             f.write(complex_txt)
 
-        complex_ann = ("T10\tCHEMICAL 932 941\tdasatinib\n"
-                "T11\tCHEMICAL 1090 1099\tBosutinib\n"
-                "T12\tCHEMICAL 46 55\tbosutinib\n"
-                "T13\tGENE-Y 1116 1119\tKIT\n"
-                "T14\tGENE-N 1123 1162\tplatelet-derived growth factor receptor\n"
-                "T15\tGENE-N 1210 1223\tSTE20 kinases\n"
-                "T16\tGENE-Y 1272 1275\tABL\n"
-                "T17\tGENE-N 1276 1281\tT315I\n"
-                "T18\tGENE-N 1415 1421\tkinase\n"
-                "T19\tGENE-Y 1448 1454\tCAMK2G\n"
-                "T1\tCHEMICAL 1242 1251\tbosutinib\n"
-                "T20\tGENE-Y 402 405\tSRC\n"
-                "T21\tGENE-Y 406 409\tABL\n"
-                "T22\tGENE-N 592 598\tkinase\n"
-                "T23\tGENE-N 634 640\tkinase\n"
-                "T24\tGENE-Y 163 166\tBCR\n"
-                "T25\tGENE-N 746 783\ttyrosine and serine/threonine kinases\n"
-                "T26\tGENE-Y 167 170\tABL\n"
-                "T27\tGENE-N 171 186\ttyrosine kinase\n"
-                "T28\tGENE-N 959 965\tkinase\n"
-                "T29\tGENE-Y 1057 1060\tSRC\n"
-                "T2\tCHEMICAL 1392 1401\tbosutinib\n"
-                "T30\tGENE-Y 1062 1065\tABL\n"
-                "T31\tGENE-Y 1070 1073\tTEC\n"
-                "T32\tGENE-N 1081 1088\tkinases\n"
-                "T33\tGENE-N 29 35\tkinase\n"
-                "T3\tCHEMICAL 420 429\tbosutinib\n"
-                "T4\tCHEMICAL 701 710\tbosutinib\n"
-                "T5\tCHEMICAL 746 754\ttyrosine\n"
-                "T6\tCHEMICAL 759 765\tserine\n"
-                "T7\tCHEMICAL 766 775\tthreonine\n"
-                "T8\tCHEMICAL 843 852\tbosutinib\n"
-                "T9\tCHEMICAL 917 926\tbosutinib\n"
-                "R0\tCPR:10 Arg1:T11 Arg2:T13\n"
-                "R1\tCPR:10 Arg1:T11 Arg2:T14\n"
-                "R2\tCPR:10 Arg1:T1 Arg2:T16\n"
-                "R3\tCPR:10 Arg1:T1 Arg2:T17\n"
-                "R4\tCPR:2 Arg1:T11 Arg2:T15\n"
-                "R5\tCPR:4 Arg1:T10 Arg2:T28\n"
-                "R6\tCPR:4 Arg1:T12 Arg2:T33\n"
-                "R7\tCPR:4 Arg1:T2 Arg2:T18\n"
-                "R8\tCPR:4 Arg1:T2 Arg2:T19\n"
-                "R9\tCPR:4 Arg1:T3 Arg2:T20\n"
-                "R10\tCPR:4 Arg1:T3 Arg2:T21\n"
-                "R11\tCPR:4 Arg1:T9 Arg2:T28\n")
+        complex_ann = (
+            "T10\tCHEMICAL 932 941\tdasatinib\n"
+            "T11\tCHEMICAL 1090 1099\tBosutinib\n"
+            "T12\tCHEMICAL 46 55\tbosutinib\n"
+            "T13\tGENE-Y 1116 1119\tKIT\n"
+            "T14\tGENE-N 1123 1162\tplatelet-derived growth factor receptor\n"
+            "T15\tGENE-N 1210 1223\tSTE20 kinases\n"
+            "T16\tGENE-Y 1272 1275\tABL\n"
+            "T17\tGENE-N 1276 1281\tT315I\n"
+            "T18\tGENE-N 1415 1421\tkinase\n"
+            "T19\tGENE-Y 1448 1454\tCAMK2G\n"
+            "T1\tCHEMICAL 1242 1251\tbosutinib\n"
+            "T20\tGENE-Y 402 405\tSRC\n"
+            "T21\tGENE-Y 406 409\tABL\n"
+            "T22\tGENE-N 592 598\tkinase\n"
+            "T23\tGENE-N 634 640\tkinase\n"
+            "T24\tGENE-Y 163 166\tBCR\n"
+            "T25\tGENE-N 746 783\ttyrosine and serine/threonine kinases\n"
+            "T26\tGENE-Y 167 170\tABL\n"
+            "T27\tGENE-N 171 186\ttyrosine kinase\n"
+            "T28\tGENE-N 959 965\tkinase\n"
+            "T29\tGENE-Y 1057 1060\tSRC\n"
+            "T2\tCHEMICAL 1392 1401\tbosutinib\n"
+            "T30\tGENE-Y 1062 1065\tABL\n"
+            "T31\tGENE-Y 1070 1073\tTEC\n"
+            "T32\tGENE-N 1081 1088\tkinases\n"
+            "T33\tGENE-N 29 35\tkinase\n"
+            "T3\tCHEMICAL 420 429\tbosutinib\n"
+            "T4\tCHEMICAL 701 710\tbosutinib\n"
+            "T5\tCHEMICAL 746 754\ttyrosine\n"
+            "T6\tCHEMICAL 759 765\tserine\n"
+            "T7\tCHEMICAL 766 775\tthreonine\n"
+            "T8\tCHEMICAL 843 852\tbosutinib\n"
+            "T9\tCHEMICAL 917 926\tbosutinib\n"
+            "R0\tCPR:10 Arg1:T11 Arg2:T13\n"
+            "R1\tCPR:10 Arg1:T11 Arg2:T14\n"
+            "R2\tCPR:10 Arg1:T1 Arg2:T16\n"
+            "R3\tCPR:10 Arg1:T1 Arg2:T17\n"
+            "R4\tCPR:2 Arg1:T11 Arg2:T15\n"
+            "R5\tCPR:4 Arg1:T10 Arg2:T28\n"
+            "R6\tCPR:4 Arg1:T12 Arg2:T33\n"
+            "R7\tCPR:4 Arg1:T2 Arg2:T18\n"
+            "R8\tCPR:4 Arg1:T2 Arg2:T19\n"
+            "R9\tCPR:4 Arg1:T3 Arg2:T20\n"
+            "R10\tCPR:4 Arg1:T3 Arg2:T21\n"
+            "R11\tCPR:4 Arg1:T9 Arg2:T28\n")
 
         self.complex_ann = f'{self.tmpdir}/mycomplexfile.ann'
         with open(self.complex_ann, 'w') as f:
diff --git a/scripts/new-dataset/annotated_doc.py b/scripts/new-dataset/annotated_doc.py
index 221e675..28a8a1c 100644
--- a/scripts/new-dataset/annotated_doc.py
+++ b/scripts/new-dataset/annotated_doc.py
@@ -194,6 +194,8 @@ def format_dygiepp(self):
         if len(self.events) > 0:  # Some datasets don't have events
             res["events"] = events
 
+        res = AnnotatedDoc.quality_check_sent_splits(res)
+
         return res
 
     def char_to_token(self):
@@ -268,6 +270,190 @@ def char_to_token(self):
             f'{self.dropped_ents} of {self.total_original_ents} entities '
             'were dropped due to tokenization mismatches.')
 
+    @staticmethod
+    def quality_check_sent_splits(doc_dict):
+        """
+        Function to detect and correct incorrect sentence splits in a dygiepp-
+        formatted doc dictionary.
+
+        This function relies on the assumption that a cross-sentence entity or
+        relation in a dygiepp-formatted doc is a result of an incorrect sentence
+        split on the part of the tokenizer, rather than intentional. If a
+        cross-sentence entity or relation is found, all sentences between the
+        sentences containing the two joined entities or entity parts will be
+        combined into one.
+
+        Example: BioInfer.d70 is one sentence only, with two relations. However,
+        the conversion to jsonl results in the following doc dictionary:
+
+        {"doc_key": "BioInfer.d70",
+        "dataset": "bioinfer",
+        "sentences": [["Aprotinin", "inhibited", "platelet", "aggregation", "induced",
+                "by", "thrombin", "(", "0.25", "U.ml-1", ")", "with", "IC50", "200",
+                "kIU.ml-1", ",", "and", "inhibited", "the", "rise", "of", "cytosolic",
+                "free", "calcium", "concentration", "in", "platelets", "stimulated", "by",
+                "thrombin", "(", "0.1", "U.ml-1", ")", "in", "the", "absence", "and", "in",
+                "the", "presence", "of", "Ca2", "+", "0.5", "mmol", "."],
+            ["L-1", "(","IC50", "117", "and", "50", "kIU.ml-1", ",", "respectively",
+                ")", ",", "but", "had", "no", "effect", "on", "the", "amounts", "of",
+                "actin", "and", "myosin", "heavy", "chain", "associated", "with",
+                "cytoskeletons", "."]],
+        "ner": [[[29, 29, "Individual_protein"], [0, 0, "Individual_protein"],
+            [6, 6, "Individual_protein"]],
+            [[68, 70, "Individual_protein"], [66, 66, "Individual_protein"]]],
+        "relations": [[[29, 29, 0, 0, "PPI"], [0, 0, 66, 66, "PPI"]],
+            [[68, 70, 0, 0, "PPI"]]]}
+
+        parameters:
+            doc_dict, dict: dygiepp-formatted doc
+
+        returns:
+            doc_dict_corrected, dict: dict with sentence splits corrected.
+                If no entity or relation crosses a sentence boundary, the
+                input dict itself is returned. Otherwise a new dict is built
+                that holds only the doc_key, dataset, sentences, ner and
+                relations keys; any other key of the input is not copied.
+
+        raises:
+            AssertionError: raised by merge_mult_splits when the sentence
+                spans to join overlap one another
+        """
+        # Get the document-level token indices that start and end each sentence
+        sent_idxs = []
+        sent_start = 0
+        for sent in doc_dict['sentences']:
+            sent_end = sent_start + len(sent) - 1
+            sent_idxs.append((sent_start, sent_end))
+            sent_start = sent_end + 1
+
+        # For each entity and relation, check if it crosses sentence boundaries.
+        # The inner loops use their own index name (sent_num): reusing the outer
+        # loop variable there would make the relations lookup below read another
+        # sentence's relations.
+        sents_to_join = []
+        for i in range(len(doc_dict['sentences'])):
+
+            for ent in doc_dict['ner'][i]:
+                e_start = ent[0]
+                e_end = ent[1]
+                ent_sent_mems = []
+                for sent_num, sent in enumerate(sent_idxs):
+                    if sent[0] <= e_start <= sent[1]:
+                        ent_sent_mems.append(sent_num)
+                    if sent[0] <= e_end <= sent[1]:
+                        ent_sent_mems.append(sent_num)
+                if ent_sent_mems[0] != ent_sent_mems[1]:
+                    ent_sent_mems = tuple(sorted(ent_sent_mems))
+                    sents_to_join.append(ent_sent_mems)
+
+            for rel in doc_dict['relations'][i]:
+                e1_start = rel[0]
+                e2_start = rel[2]
+                rel_sent_mems = []
+                for sent_num, sent in enumerate(sent_idxs):
+                    if sent[0] <= e1_start <= sent[1]:
+                        rel_sent_mems.append(sent_num)
+                    if sent[0] <= e2_start <= sent[1]:
+                        rel_sent_mems.append(sent_num)
+                if rel_sent_mems[0] != rel_sent_mems[1]:
+                    rel_sent_mems = tuple(sorted(rel_sent_mems))
+                    sents_to_join.append(rel_sent_mems)
+
+        # Drop duplicate pairs (several annotations can cross the same split)
+        sents_to_join = list(set(sents_to_join))
+
+        # Nothing crosses a sentence boundary: hand the doc back untouched
+        if not sents_to_join:
+            return doc_dict
+
+        # Merge continuous joins once up front; a single pair needs no merging
+        if len(sents_to_join) > 1:
+            sents_to_join = AnnotatedDoc.merge_mult_splits(sents_to_join)
+
+        # Join sentences that need it. Only these five keys are carried into
+        # the corrected doc.
+        doc_dict_corrected = {'doc_key': doc_dict['doc_key'],
+                              'dataset': doc_dict['dataset']}
+        for key in ['sentences', 'ner', 'relations']:
+            joined = []
+            for join_num, pair in enumerate(sents_to_join):
+                first_idx = min(pair)
+                last_idx = max(pair)
+
+                # Add all sentences before the first to join
+                if join_num == 0:
+                    joined.extend(doc_dict[key][:first_idx])
+
+                # Join the group and add it as one sentence
+                sent_list = doc_dict[key][first_idx:last_idx + 1]
+                joined.append([tok for sent in sent_list for tok in sent])
+
+                if join_num == len(sents_to_join) - 1:
+                    # Last merge: add everything after it
+                    joined.extend(doc_dict[key][last_idx + 1:])
+                else:
+                    # Otherwise add any sentences in the gap before next merge
+                    next_first_idx = sents_to_join[join_num + 1][0]
+                    joined.extend(doc_dict[key][last_idx + 1:next_first_idx])
+
+            # Add to new doc
+            doc_dict_corrected[key] = joined
+
+        print(f'{len(sents_to_join)} sentence joins were performed '
+              f'to fix erroneous sentence splits in doc {doc_dict["doc_key"]}')
+
+        return doc_dict_corrected
+
+    @staticmethod
+    def merge_mult_splits(sents_to_join):
+        """
+        Given a list of sentence index pairs, determine if any represent multi-
+        joins (a sentence that was split into multiple fragments), and get the
+        first and last indices of each continuous split to join.
+
+        Pairs chain when one pair's last index equals the next pair's first
+        index, e.g. [(2, 3), (3, 4), (6, 7)] becomes [(2, 4), (6, 7)].
+
+        parameters:
+            sents_to_join, list of tuples: pairs of sentence indices
+
+        returns:
+            final_pairings, list of tuples: first and last indices of
+                continuous splits, sorted by first index; an empty input
+                gives an empty list
+
+        raises:
+            AssertionError: if, once sorted by first index, a pair ends
+                after the next pair starts (the joins overlap)
+        """
+        # Sort by the first index so chained pairs sit next to each other
+        srtd = sorted(sents_to_join, key=lambda x: x[0])
+
+        # Walk the sorted pairs once, extending the current span whenever a
+        # pair chains onto it. An empty input never enters the loop and
+        # yields an empty list.
+        final_pairings = []
+        for first, last in srtd:
+            if final_pairings:
+                span_first, span_last = final_pairings[-1]
+
+                # Common-sense check that the joins don't overlap. Raised
+                # explicitly (not via `assert`) so that it still runs under
+                # `python -O`.
+                if span_last > first:
+                    raise AssertionError('One or more pairs of sentences to '
+                                         'join overlaps another')
+
+                # This pair continues the current span: extend it in place
+                if span_last == first:
+                    final_pairings[-1] = (span_first, last)
+                    continue
+
+            # Otherwise the pair opens a new span
+            final_pairings.append((first, last))
+
+        return final_pairings
+
 
 class Ent:
     def __init__(self, line):