Skip to content

Commit

Permalink
Merge pull request #1050 from griffithlab/iedb_api_incorrect_data
Browse files Browse the repository at this point in the history
Handle various IEDB outputs with incorrect data
  • Loading branch information
susannasiebert authored Jan 17, 2024
2 parents cc994c5 + 159bdcd commit d5499ed
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 15 deletions.
6 changes: 4 additions & 2 deletions pvactools/lib/call_iedb.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,18 @@ def main(args_input = sys.argv[1:]):
)
parser.add_argument('--tmp-dir',
help="Location to write tmp files to")
parser.add_argument('--log-dir',
help="Location to write log files to")
args = parser.parse_args(args_input)

prediction_class = getattr(sys.modules[__name__], args.method)
prediction_class_object = prediction_class()

try:
(response_text, output_mode) = prediction_class_object.predict(args.input_file, args.allele, args.epitope_length, args.iedb_executable_path, args.iedb_retries, tmp_dir=args.tmp_dir)
(response_text, output_mode) = prediction_class_object.predict(args.input_file, args.allele, args.epitope_length, args.iedb_executable_path, args.iedb_retries, tmp_dir=args.tmp_dir, log_dir=args.log_dir)
except Exception as err:
if str(err) == 'len(peptide_list) != len(scores)':
(response_text, output_mode) = prediction_class_object.predict(args.input_file, args.allele, args.epitope_length, args.iedb_executable_path, args.iedb_retries, tmp_dir=args.tmp_dir)
(response_text, output_mode) = prediction_class_object.predict(args.input_file, args.allele, args.epitope_length, args.iedb_executable_path, args.iedb_retries, tmp_dir=args.tmp_dir, log_dir=args.log_dir)
else:
raise err

Expand Down
3 changes: 3 additions & 0 deletions pvactools/lib/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,7 @@ def call_iedb(self, chunks):
'-e', self.iedb_executable,
'-l', str(epl),
'--tmp-dir', self.tmp_dir,
'--log-dir', self.log_dir(),
]
argument_sets.append(arguments)

Expand Down Expand Up @@ -645,6 +646,8 @@ def call_iedb(self, chunks, length):
'-r', str(self.iedb_retries),
'-e', self.iedb_executable,
'-l', str(length),
'--tmp-dir', self.tmp_dir,
'--log-dir', self.log_dir(),
]
argument_sets.append(arguments)

Expand Down
71 changes: 58 additions & 13 deletions pvactools/lib/prediction_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from Bio import SeqIO
import random
import uuid
import io
from datetime import datetime

class IEDB(metaclass=ABCMeta):
Expand Down Expand Up @@ -51,7 +52,26 @@ def filter_response(cls, response_text):
def check_length_valid_for_allele(self, length, allele):
return True

def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, tmp_dir=None):
def check_iedb_api_response_matches(self, input_file, response_text, epitope_length):
input_peptides = set()
with open(input_file) as input_fh:
for record in SeqIO.parse(input_fh, "fasta"):
seq = record.seq
input_peptides.update([seq[i:i+epitope_length] for i in range(0, len(seq)-epitope_length+1)])

output_peptides = set()
for record in csv.DictReader(io.StringIO(response_text), delimiter="\t"):
if 'peptide' in record:
output_peptides.add(record['peptide'])

return (input_peptides == output_peptides, input_peptides, output_peptides)


def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, tmp_dir=None, log_dir=None):
if log_dir is not None:
log_file = os.path.join(log_dir, "iedb.log")
else:
log_file = None
if iedb_executable_path is not None:
arguments = [sys.executable]
arguments.extend(self.iedb_executable_params(iedb_executable_path, self.iedb_prediction_method, allele, input_file, epitope_length))
Expand All @@ -63,32 +83,57 @@ def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb
return (response_text, 'wb')
else:
with open(input_file, 'r') as input_fh:
sequence_text = input_fh.read()
data = {
'sequence_text': input_fh.read(),
'sequence_text': sequence_text,
'method': self.iedb_prediction_method,
'allele': allele.replace('-DPB', '/DPB').replace('-DQB', '/DQB'),
'length': epitope_length,
'user_tool': 'pVac-seq',
}

response = requests.post(self.url, data=data)
(peptides_match, input_peptides, output_peptides) = self.check_iedb_api_response_matches(input_file, response.text, epitope_length)
retries = 0
while (response.status_code == 500 or response.status_code == 403 or response.text.count("\n") == 1) and retries < iedb_retries:
if response.text.count("\n") == 1:
print("No data returned. Retrying.")
print(datetime.now())
print(data)
while (response.status_code == 500 or response.status_code == 403 or not peptides_match) and retries < iedb_retries:
if response.status_code == 200 and not peptides_match:
log_text = "IEDB API Output doesn't match input. Retrying.\n"
log_text += "{}\n".format(datetime.now())
log_text += "Inputs:\n"
log_text += "{}\n".format(data)
log_text += "Output:\n"
log_text += "{}\n".format(response.text)
if log_file:
with open(log_file, "a") as log_fh:
log_fh.write(log_text)
else:
print(log_text)

random.seed(uuid.uuid4().int)
time.sleep(random.randint(30,90) * retries)
retries += 1
print("IEDB: Retry %s of %s" % (retries, iedb_retries))
response = requests.post(self.url, data=data)
(peptides_match, input_peptides, output_peptides) = self.check_iedb_api_response_matches(input_file, response.text, epitope_length)

if response.status_code != 200:
sys.exit("Error posting request to IEDB.\n%s" % response.text)
response_text = response.text
if not peptides_match:
log_text = "Error. IEDB API Output doesn't match input and number of retries exceeded."
log_text += "{}\n".format(datetime.now())
log_text += "Inputs:\n"
log_text += "{}\n".format(data)
log_text += "Output:\n"
log_text += "{}\n".format(response.text)
if log_file:
with open(log_file, "a") as log_fh:
log_fh.write(log_text)
else:
print(log_text)
sys.exit("Error. IEDB API Output doesn't match input and number of retries exceeded.")

output_mode = 'w'
return (response_text, 'w')
return (response.text, 'w')

class MHCnuggets(metaclass=ABCMeta):
def check_length_valid_for_allele(self, length, allele):
Expand All @@ -101,7 +146,7 @@ def valid_allele_names_for_class(self, class_type):
with open(alleles_file_name, 'r') as fh:
return list(filter(None, fh.read().split('\n')))

def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, class_type, tmp_dir=None):
def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, class_type, tmp_dir=None, log_dir=None):
tmp_output_file = tempfile.NamedTemporaryFile('r', dir=tmp_dir, delete=False)
script = os.path.join(os.path.dirname(os.path.realpath(__file__)), "call_mhcnuggets.py")
arguments = ["python", script, input_file, allele, str(epitope_length), class_type, tmp_output_file.name]
Expand Down Expand Up @@ -312,7 +357,7 @@ def determine_neoepitopes(self, sequence, length):
epitopes[i+1] = sequence[i:i+length]
return epitopes

def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, tmp_dir=None):
def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, tmp_dir=None, log_dir=None):
results = pd.DataFrame()
all_epitopes = []
for record in SeqIO.parse(input_file, "fasta"):
Expand Down Expand Up @@ -370,7 +415,7 @@ def valid_lengths_for_allele(self, allele):
def mhcnuggets_allele(self, allele):
return allele.replace('*', '')

def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, tmp_dir=None):
def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, tmp_dir=None, log_dir=None):
return MHCnuggets.predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, 'I', tmp_dir=tmp_dir)

class IEDBMHCI(MHCI, IEDB, metaclass=ABCMeta):
Expand Down Expand Up @@ -464,7 +509,7 @@ def valid_lengths_for_allele(self, allele):
def mhcnuggets_allele(self,allele):
return "HLA-{}".format(allele).replace('*', '')

def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, tmp_dir=None):
def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, tmp_dir=None, log_dir=None):
return MHCnuggets.predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, 'II', tmp_dir=tmp_dir)

class IEDBMHCII(MHCII, IEDB, metaclass=ABCMeta):
Expand Down

0 comments on commit d5499ed

Please sign in to comment.