Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow pVACvector to remove peptides in order to find a partial solution #1168

Open
wants to merge 1 commit into
base: staging
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions pvactools/lib/run_argument_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,11 @@ def pvacvector(self):
help="Number of amino acids to permit clipping from the start and/or end of peptides in order to test novel junction epitopes when the first pass on the full peptide fails.",
default=3,
)
self.parser.add_argument(
'--allow-n-peptide-exclusion', type=int,
help="If no solution is found after adding spacers and clipping peptides, attempt to find partial solutions with up to n peptides removed.",
default=2,
)

class PvacbindRunArgumentParser(RunArgumentParser):
def __init__(self):
Expand Down
192 changes: 121 additions & 71 deletions pvactools/tools/pvacvector/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import itertools
import json
import platform
import shutil

from pvactools.lib.optimal_peptide import OptimalPeptide
from pvactools.lib.vector_visualization import VectorVisualization
Expand Down Expand Up @@ -251,72 +252,72 @@ def find_optimal_path(graph, distance_matrix, seq_dict, base_output_dir, args):
peptide.copy_strategy = "slice"
peptide.save_state_on_exit = False
state, e = peptide.anneal()
print("%i distance :" % e)

names = list()
cumulative_weight = 0
all_scores = list()
problematic_junctions = []
for i in range(0, (len(state) - 1)):
names.append(state[i])
if graph.has_edge(state[i], state[i + 1]):
edge = graph[state[i]][state[i + 1]]
try:
min_score = min(min_score, edge['weight'])
except:
min_score = edge['weight']
cumulative_weight += edge['weight']
all_scores.append(str(edge['weight']))
spacer = edge['spacer']
if spacer != 'None':
names.append(spacer)
else:
problematic_junctions.append("{} - {}".format(state[i], state[i+1]))
names.append(state[-1])
if len(problematic_junctions) > 0:
return (None, "No valid junction between peptides: {}".format(", ".join(problematic_junctions)))

print("%i distance :" % e)
for id in state:
print("\t", id)

results_file = os.path.join(base_output_dir, args.sample_name + '_results.fa')
with open(results_file, 'w') as f:
names = list()
cumulative_weight = 0
all_scores = list()

problematic_junctions = []
for i in range(0, (len(state) - 1)):
names.append(state[i])
if graph.has_edge(state[i], state[i + 1]):
edge = graph[state[i]][state[i + 1]]
try:
min_score = min(min_score, edge['weight'])
except:
min_score = edge['weight']
cumulative_weight += edge['weight']
all_scores.append(str(edge['weight']))
spacer = edge['spacer']
if spacer != 'None':
names.append(spacer)
median_score = str(cumulative_weight/len(all_scores))
peptide_id_list = ','.join(names)
score_list = ','.join(all_scores)
output = list()
output.append(">")
output.append(peptide_id_list)
output.append("|Median_Junction_Score:")
output.append(median_score)
output.append("|Lowest_Junction_Score:")
output.append(str(min_score))
output.append("|All_Junction_Scores:")
output.append(score_list)
output.append("\n")
for (i, name) in enumerate(names):
if name in args.spacers:
output.append(name)
else:
full_sequence = seq_dict[name]
if i > 0:
left_name = names[i-1]
if left_name in args.spacers:
left_name = names[i-2]
left_clip = graph.edges[(left_name, name)]['right_partner_trim']
else:
problematic_junctions.append("{} - {}".format(state[i], state[i+1]))
if len(problematic_junctions) > 0:
return (None, "No valid junction between peptides: {}".format(", ".join(problematic_junctions)))
names.append(state[-1])
median_score = str(cumulative_weight/len(all_scores))
peptide_id_list = ','.join(names)
score_list = ','.join(all_scores)
output = list()
output.append(">")
output.append(peptide_id_list)
output.append("|Median_Junction_Score:")
output.append(median_score)
output.append("|Lowest_Junction_Score:")
output.append(str(min_score))
output.append("|All_Junction_Scores:")
output.append(score_list)
output.append("\n")
for (i, name) in enumerate(names):
if name in args.spacers:
output.append(name)
left_clip = 0
if i < (len(names) - 1):
right_name = names[i+1]
if right_name in args.spacers:
right_name = names[i+2]
right_clip = graph.edges[(name, right_name)]['left_partner_trim']
else:
full_sequence = seq_dict[name]
if i > 0:
left_name = names[i-1]
if left_name in args.spacers:
left_name = names[i-2]
left_clip = graph.edges[(left_name, name)]['right_partner_trim']
else:
left_clip = 0
if i < (len(names) - 1):
right_name = names[i+1]
if right_name in args.spacers:
right_name = names[i+2]
right_clip = graph.edges[(name, right_name)]['left_partner_trim']
else:
right_clip = 0
clipped_sequence = full_sequence[left_clip:len(full_sequence) - right_clip]
output.append(clipped_sequence)
right_clip = 0
clipped_sequence = full_sequence[left_clip:len(full_sequence) - right_clip]
output.append(clipped_sequence)

output.append("\n")
output.append("\n")
results_file = os.path.join(base_output_dir, args.sample_name + '_results.fa')
with open(results_file, 'w') as f:
f.write(''.join(output))
return (results_file, None)

Expand Down Expand Up @@ -451,21 +452,12 @@ def main(args_input=sys.argv[1:]):
print("No valid path found. {}".format(error))
junctions_to_process = identify_problematic_junctions(graph, seq_tuples)
else:
junctions_file = os.path.join(current_output_dir, 'junctions.tsv')
import shutil
shutil.copy(junctions_file, base_output_dir)
break
tries += 1
junctions_file = os.path.join(current_output_dir, 'junctions.tsv')
shutil.copy(junctions_file, base_output_dir)

if results_file is None:
print(
'Unable to find path. ' +
'A vaccine design using the parameters specified could not be found. Some options that you may want to consider:\n' +
'1) decreasing the acceptable junction binding score to allow more possible connections (-b parameter)\n' +
'2) using the "median" binding score instead of the "best" binding score for each junction, (best may be too conservative, -m parameter)\n' +
'3) if running with a percentile threshold set, either remove this parameter or reduce the acceptable threshold to allow more possible connections (--percentile-threshold parameter)'
)
else:
if results_file is not None:
if 'DISPLAY' in os.environ.keys():
VectorVisualization(results_file, base_output_dir, args.spacers).draw()

Expand All @@ -478,6 +470,64 @@ def main(args_input=sys.argv[1:]):
shutil.rmtree(os.path.join(base_output_dir, str(subdirectory), spacer, 'MHC_Class_I'), ignore_errors=True)
shutil.rmtree(os.path.join(base_output_dir, str(subdirectory), spacer, 'MHC_Class_II'), ignore_errors=True)

change_permissions_recursive(base_output_dir, 0o755, 0o644)
return

solution_found = False
if results_file is None:
for n_nodes_to_remove in range(1, args.allow_n_peptide_exclusion+1):
node_sets_to_remove = list(itertools.combinations(seq_keys, n_nodes_to_remove))
for node_set in node_sets_to_remove:
print("Removing nodes: {}".format(', '.join(node_set)))
#Creating output directory
current_output_dir = os.path.join(base_output_dir, "without_{}".format('_'.join(node_set)))
os.makedirs(current_output_dir, exist_ok=True)
#Creating junctions file with node_set removed
junctions_file = os.path.join(base_output_dir, 'junctions.tsv')
modified_junctions_file = os.path.join(current_output_dir, 'junctions.tsv')
with open(junctions_file) as input_fh, open(modified_junctions_file, 'w') as output_fh:
reader = csv.DictReader(input_fh, delimiter='\t')
writer = csv.DictWriter(output_fh, delimiter='\t', fieldnames=reader.fieldnames)
writer.writeheader()
for line in reader:
for node in node_set:
if line['left_peptide'] == node or line['right_peptide'] == node:
continue
writer.writerow(line)
#Modify graph to remove node_set and test it
modified_graph = graph.copy()
modified_graph.remove_nodes_from(node_set)
modified_seq_dict = seq_dict.copy()
for node in node_set:
del modified_seq_dict[node]
(valid, error) = check_graph_valid(modified_graph, modified_seq_dict)
if not valid:
print("No valid path found after removing nodes: {}".format(', '.join(node_set)))
continue
distance_matrix = create_distance_matrix(modified_graph)
(results_file, error) = find_optimal_path(modified_graph, distance_matrix, modified_seq_dict, current_output_dir, args)
if results_file is None:
print("No valid path found after removing nodes: {}".format(', '.join(node_set)))
continue
else:
solution_found = True
if 'DISPLAY' in os.environ.keys():
VectorVisualization(results_file, current_output_dir, args.spacers).draw()
dna_results_file = os.path.join(current_output_dir, args.sample_name + '_results.dna.fa')
create_dna_backtranslation(results_file, dna_results_file)
if solution_found:
break

if not solution_found:
print(
'Unable to find path. ' +
'A vaccine design using the parameters specified could not be found. Some options that you may want to consider:\n' +
'1) decreasing the acceptable junction binding score to allow more possible connections (-b parameter)\n' +
'2) using the "median" binding score instead of the "best" binding score for each junction, (best may be too conservative, -m parameter)\n' +
'3) if running with a percentile threshold set, either remove this parameter or reduce the acceptable threshold to allow more possible connections (--percentile-threshold parameter)\n' +
'4) increase the number of peptides that can be excluded from the vector (--allow-n-peptide-exclusion parameter)'
)

change_permissions_recursive(base_output_dir, 0o755, 0o644)

if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>MT.SUMF2.G23A,MT.TP53.R157H,MT.DTX3L.G501R,MT.ACSL3.S345N,MT.PRDM15.G654W,MT.NRCAM.P838H,MT.POM121C.G3107R,MT.FAT3.R4848T,MT.PEX1.V356I|Median_Junction_Score:32636.72125|Lowest_Junction_Score:25464.36|All_Junction_Scores:35215.03,30620.2,36498.27,37632.32,36869.38,25464.36,32071.19,26723.02
KLGILFGRKSIWQKLRRLDGALSLR
KQSQWMTEVERHCPHFERCSRSDGL
LAKAKQYRLKGWGMSFLAGKKLKEG
HPRWEILIGRQNVTGGYYKREAKTK
RTGLIAHFGEGWPGWSRLRRLPDDK
DGTWICRFNGNHQPRISSLTNGVPI
FGAPAWSQPWFRGSTFVFSGGAATS
SLASTLSPSCRTTPQFFPSSYLPPH
KTTWQVWLDPFIKEEGSEEFDFILP
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>MT.SUMF2.G23A,MT.ACSL3.S345N,MT.PRDM15.G654W,MT.PEX1.V356I,MT.TP53.R157H,MT.DTX3L.G501R,MT.NRCAM.P838H,MT.POM121C.G3107R,MT.CASP10.S654R|Median_Junction_Score:30083.295000000002|Lowest_Junction_Score:23486.97|All_Junction_Scores:24253.18,37632.32,37434.15,30563.6,30620.2,31211.58,25464.36,23486.97
KLGILFGRKSIWQKLRRLDGALSLR
HPRWEILIGRQNVTGGYYKREAKTK
RTGLIAHFGEGWPGWSRLRRLPDDK
KTTWQVWLDPFIKEEGSEEFDFILP
KQSQWMTEVERHCPHFERCSRSDGL
LAKAKQYRLKGWGMSFLAGKKLKEG
DGTWICRFNGNHQPRISSLTNGVPI
FGAPAWSQPWFRGSTFVFSGGAATS
GYVWFRFVEEGWWYIQSFCNHLKKL
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>MT.CASP10.S654R,MT.DTX3L.G501R,MT.ACSL3.S345N,MT.POM121C.G3107R,MT.FAT3.R4848T,MT.PRDM15.G654W,MT.NRCAM.P838H,MT.SUMF2.G23A,MT.TP53.R157H|Median_Junction_Score:30712.94125|Lowest_Junction_Score:24345.72|All_Junction_Scores:24345.72,36498.27,28221.36,32071.19,26803.52,36869.38,25679.06,35215.03
GYVWFRFVEEGWWYIQSFCNHLKKL
LAKAKQYRLKGWGMSFLAGKKLKEG
HPRWEILIGRQNVTGGYYKREAKTK
FGAPAWSQPWFRGSTFVFSGGAATS
SLASTLSPSCRTTPQFFPSSYLPPH
RTGLIAHFGEGWPGWSRLRRLPDDK
DGTWICRFNGNHQPRISSLTNGVPI
KLGILFGRKSIWQKLRRLDGALSLR
KQSQWMTEVERHCPHFERCSRSDGL
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>MT.FAT3.R4848T,MT.PEX1.V356I,MT.TP53.R157H,MT.DTX3L.G501R,MT.PRDM15.G654W,MT.NRCAM.P838H,MT.SUMF2.G23A,MT.ACSL3.S345N,MT.CASP10.S654R|Median_Junction_Score:28628.89|Lowest_Junction_Score:24253.18|All_Junction_Scores:26723.02,30563.6,30620.2,28746.49,36869.38,25679.06,24253.18,25576.19
SLASTLSPSCRTTPQFFPSSYLPPH
KTTWQVWLDPFIKEEGSEEFDFILP
KQSQWMTEVERHCPHFERCSRSDGL
LAKAKQYRLKGWGMSFLAGKKLKEG
RTGLIAHFGEGWPGWSRLRRLPDDK
DGTWICRFNGNHQPRISSLTNGVPI
KLGILFGRKSIWQKLRRLDGALSLR
HPRWEILIGRQNVTGGYYKREAKTK
GYVWFRFVEEGWWYIQSFCNHLKKL
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>MT.CASP10.S654R,MT.DTX3L.G501R,MT.NRCAM.P838H,MT.POM121C.G3107R,MT.SUMF2.G23A,MT.ACSL3.S345N,MT.FAT3.R4848T,MT.PEX1.V356I,MT.TP53.R157H|Median_Junction_Score:28468.2575|Lowest_Junction_Score:24253.18|All_Junction_Scores:24345.72,31211.58,25464.36,29377.88,24253.18,35806.72,26723.02,30563.6
GYVWFRFVEEGWWYIQSFCNHLKKL
LAKAKQYRLKGWGMSFLAGKKLKEG
DGTWICRFNGNHQPRISSLTNGVPI
FGAPAWSQPWFRGSTFVFSGGAATS
KLGILFGRKSIWQKLRRLDGALSLR
HPRWEILIGRQNVTGGYYKREAKTK
SLASTLSPSCRTTPQFFPSSYLPPH
KTTWQVWLDPFIKEEGSEEFDFILP
KQSQWMTEVERHCPHFERCSRSDGL
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>MT.CASP10.S654R,MT.DTX3L.G501R,MT.ACSL3.S345N,MT.POM121C.G3107R,MT.FAT3.R4848T,MT.PRDM15.G654W,MT.NRCAM.P838H,MT.PEX1.V356I,MT.TP53.R157H|Median_Junction_Score:30769.6675|Lowest_Junction_Score:24345.72|All_Junction_Scores:24345.72,36498.27,28221.36,32071.19,26803.52,36869.38,30784.3,30563.6
GYVWFRFVEEGWWYIQSFCNHLKKL
LAKAKQYRLKGWGMSFLAGKKLKEG
HPRWEILIGRQNVTGGYYKREAKTK
FGAPAWSQPWFRGSTFVFSGGAATS
SLASTLSPSCRTTPQFFPSSYLPPH
RTGLIAHFGEGWPGWSRLRRLPDDK
DGTWICRFNGNHQPRISSLTNGVPI
KTTWQVWLDPFIKEEGSEEFDFILP
KQSQWMTEVERHCPHFERCSRSDGL
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>MT.POM121C.G3107R,MT.CASP10.S654R,MT.DTX3L.G501R,MT.NRCAM.P838H,MT.SUMF2.G23A,MT.ACSL3.S345N,MT.FAT3.R4848T,MT.PRDM15.G654W,MT.PEX1.V356I|Median_Junction_Score:28627.6125|Lowest_Junction_Score:23486.97|All_Junction_Scores:23486.97,24345.72,31211.58,25679.06,24253.18,35806.72,26803.52,37434.15
FGAPAWSQPWFRGSTFVFSGGAATS
GYVWFRFVEEGWWYIQSFCNHLKKL
LAKAKQYRLKGWGMSFLAGKKLKEG
DGTWICRFNGNHQPRISSLTNGVPI
KLGILFGRKSIWQKLRRLDGALSLR
HPRWEILIGRQNVTGGYYKREAKTK
SLASTLSPSCRTTPQFFPSSYLPPH
RTGLIAHFGEGWPGWSRLRRLPDDK
KTTWQVWLDPFIKEEGSEEFDFILP
Loading
Loading