diff --git a/pvactools/lib/call_iedb.py b/pvactools/lib/call_iedb.py index b4053a14..85d45c36 100644 --- a/pvactools/lib/call_iedb.py +++ b/pvactools/lib/call_iedb.py @@ -30,16 +30,18 @@ def main(args_input = sys.argv[1:]): ) parser.add_argument('--tmp-dir', help="Location to write tmp files to") + parser.add_argument('--log-dir', + help="Location to write log files to") args = parser.parse_args(args_input) prediction_class = getattr(sys.modules[__name__], args.method) prediction_class_object = prediction_class() try: - (response_text, output_mode) = prediction_class_object.predict(args.input_file, args.allele, args.epitope_length, args.iedb_executable_path, args.iedb_retries, tmp_dir=args.tmp_dir) + (response_text, output_mode) = prediction_class_object.predict(args.input_file, args.allele, args.epitope_length, args.iedb_executable_path, args.iedb_retries, tmp_dir=args.tmp_dir, log_dir=args.log_dir) except Exception as err: if str(err) == 'len(peptide_list) != len(scores)': - (response_text, output_mode) = prediction_class_object.predict(args.input_file, args.allele, args.epitope_length, args.iedb_executable_path, args.iedb_retries, tmp_dir=args.tmp_dir) + (response_text, output_mode) = prediction_class_object.predict(args.input_file, args.allele, args.epitope_length, args.iedb_executable_path, args.iedb_retries, tmp_dir=args.tmp_dir, log_dir=args.log_dir) else: raise err diff --git a/pvactools/lib/pipeline.py b/pvactools/lib/pipeline.py index a74978e0..ff7f58d3 100644 --- a/pvactools/lib/pipeline.py +++ b/pvactools/lib/pipeline.py @@ -339,6 +339,7 @@ def call_iedb(self, chunks): '-e', self.iedb_executable, '-l', str(epl), '--tmp-dir', self.tmp_dir, + '--log-dir', self.log_dir(), ] argument_sets.append(arguments) @@ -645,6 +646,8 @@ def call_iedb(self, chunks, length): '-r', str(self.iedb_retries), '-e', self.iedb_executable, '-l', str(length), + '--tmp-dir', self.tmp_dir, + '--log-dir', self.log_dir(), ] argument_sets.append(arguments) diff --git a/pvactools/lib/prediction_class.py b/pvactools/lib/prediction_class.py index 8cfe5303..93188b0b 100644 --- a/pvactools/lib/prediction_class.py +++ b/pvactools/lib/prediction_class.py @@ -13,6 +13,7 @@ from Bio import SeqIO import random import uuid +import io from datetime import datetime class IEDB(metaclass=ABCMeta): @@ -51,7 +52,26 @@ def filter_response(cls, response_text): def check_length_valid_for_allele(self, length, allele): return True - def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, tmp_dir=None): + def check_iedb_api_response_matches(self, input_file, response_text, epitope_length): + input_peptides = set() + with open(input_file) as input_fh: + for record in SeqIO.parse(input_fh, "fasta"): + seq = record.seq + input_peptides.update([seq[i:i+epitope_length] for i in range(0, len(seq)-epitope_length+1)]) + + output_peptides = set() + for record in csv.DictReader(io.StringIO(response_text), delimiter="\t"): + if 'peptide' in record: + output_peptides.add(record['peptide']) + + return (input_peptides == output_peptides, input_peptides, output_peptides) + + + def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, tmp_dir=None, log_dir=None): + if log_dir is not None: + log_file = os.path.join(log_dir, "iedb.log") + else: + log_file = None if iedb_executable_path is not None: arguments = [sys.executable] arguments.extend(self.iedb_executable_params(iedb_executable_path, self.iedb_prediction_method, allele, input_file, epitope_length)) @@ -63,8 +83,9 @@ def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb return (response_text, 'wb') else: with open(input_file, 'r') as input_fh: + sequence_text = input_fh.read() data = { - 'sequence_text': input_fh.read(), + 'sequence_text': sequence_text, 'method': self.iedb_prediction_method, 'allele': allele.replace('-DPB', '/DPB').replace('-DQB', '/DQB'), 'length': epitope_length, @@ -72,23 +93,47 @@ def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb } response = requests.post(self.url, data=data) + (peptides_match, input_peptides, output_peptides) = self.check_iedb_api_response_matches(input_file, response.text, epitope_length) retries = 0 - while (response.status_code == 500 or response.status_code == 403 or response.text.count("\n") == 1) and retries < iedb_retries: - if response.text.count("\n") == 1: - print("No data returned. Retrying.") - print(datetime.now()) - print(data) + while (response.status_code == 500 or response.status_code == 403 or not peptides_match) and retries < iedb_retries: + if response.status_code == 200 and not peptides_match: + log_text = "IEDB API Output doesn't match input. Retrying.\n" + log_text += "{}\n".format(datetime.now()) + log_text += "Inputs:\n" + log_text += "{}\n".format(data) + log_text += "Output:\n" + log_text += "{}\n".format(response.text) + if log_file: + with open(log_file, "a") as log_fh: + log_fh.write(log_text) + else: + print(log_text) + random.seed(uuid.uuid4().int) time.sleep(random.randint(30,90) * retries) retries += 1 print("IEDB: Retry %s of %s" % (retries, iedb_retries)) response = requests.post(self.url, data=data) + (peptides_match, input_peptides, output_peptides) = self.check_iedb_api_response_matches(input_file, response.text, epitope_length) if response.status_code != 200: sys.exit("Error posting request to IEDB.\n%s" % response.text) - response_text = response.text + if not peptides_match: + log_text = "Error. IEDB API Output doesn't match input and number of retries exceeded." + log_text += "{}\n".format(datetime.now()) + log_text += "Inputs:\n" + log_text += "{}\n".format(data) + log_text += "Output:\n" + log_text += "{}\n".format(response.text) + if log_file: + with open(log_file, "a") as log_fh: + log_fh.write(log_text) + else: + print(log_text) + sys.exit("Error. IEDB API Output doesn't match input and number of retries exceeded.") + output_mode = 'w' - return (response_text, 'w') + return (response.text, 'w') class MHCnuggets(metaclass=ABCMeta): def check_length_valid_for_allele(self, length, allele): @@ -101,7 +146,7 @@ def valid_allele_names_for_class(self, class_type): with open(alleles_file_name, 'r') as fh: return list(filter(None, fh.read().split('\n'))) - def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, class_type, tmp_dir=None): + def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, class_type, tmp_dir=None, log_dir=None): tmp_output_file = tempfile.NamedTemporaryFile('r', dir=tmp_dir, delete=False) script = os.path.join(os.path.dirname(os.path.realpath(__file__)), "call_mhcnuggets.py") arguments = ["python", script, input_file, allele, str(epitope_length), class_type, tmp_output_file.name] @@ -312,7 +357,7 @@ def determine_neoepitopes(self, sequence, length): epitopes[i+1] = sequence[i:i+length] return epitopes - def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, tmp_dir=None): + def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, tmp_dir=None, log_dir=None): results = pd.DataFrame() all_epitopes = [] for record in SeqIO.parse(input_file, "fasta"): @@ -370,7 +415,7 @@ def valid_lengths_for_allele(self, allele): def mhcnuggets_allele(self, allele): return allele.replace('*', '') - def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, tmp_dir=None): + def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, tmp_dir=None, log_dir=None): return MHCnuggets.predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, 'I', tmp_dir=tmp_dir) class IEDBMHCI(MHCI, IEDB, metaclass=ABCMeta): @@ -464,7 +509,7 @@ def valid_lengths_for_allele(self, allele): def mhcnuggets_allele(self,allele): return "HLA-{}".format(allele).replace('*', '') - def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, tmp_dir=None): + def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, tmp_dir=None, log_dir=None): return MHCnuggets.predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, 'II', tmp_dir=tmp_dir) class IEDBMHCII(MHCII, IEDB, metaclass=ABCMeta):