Skip to content

Commit

Permalink
Merge pull request #1063 from griffithlab/bigmhc
Browse files Browse the repository at this point in the history
Add BigMHC and DeepImmuno prediction algorithms
  • Loading branch information
susannasiebert authored Feb 13, 2024
2 parents d48bbda + 13217d0 commit 4f008ed
Show file tree
Hide file tree
Showing 20 changed files with 20,820 additions and 570 deletions.
7 changes: 7 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,13 @@ jobs:
python-version: ['3.7', '3.8', '3.9', '3.10', '3.11']

steps:
- name: Maximize build space
uses: AdityaGarg8/remove-unwanted-software@v2
with:
remove-android: 'true'
remove-dotnet: 'true'
remove-haskell: 'true'
remove-codeql: 'true'
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
Expand Down
44 changes: 23 additions & 21 deletions pvactools/lib/aggregate_all_epitopes.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def determine_used_prediction_algorithms(self):
potential_algorithms = PredictionClass.prediction_methods()
prediction_algorithms = []
for algorithm in potential_algorithms:
if algorithm == 'NetMHCpanEL' or algorithm == 'NetMHCIIpanEL':
if algorithm in ['NetMHCpanEL', 'NetMHCIIpanEL', 'BigMHC_EL', 'BigMHC_IM', 'DeepImmuno']:
continue
if "{} MT IC50 Score".format(algorithm) in headers or "{} IC50 Score".format(algorithm) in headers:
prediction_algorithms.append(algorithm)
Expand All @@ -129,7 +129,7 @@ def problematic_positions_exist(self):

def determine_used_el_algorithms(self):
headers = pd.read_csv(self.input_file, delimiter="\t", nrows=0).columns.tolist()
potential_algorithms = ["MHCflurryEL Processing", "MHCflurryEL Presentation", "NetMHCpanEL", "NetMHCIIpanEL"]
potential_algorithms = ["MHCflurryEL Processing", "MHCflurryEL Presentation", "NetMHCpanEL", "NetMHCIIpanEL", "BigMHC_EL", 'BigMHC_IM', 'DeepImmuno']
prediction_algorithms = []
for algorithm in potential_algorithms:
if "{} MT Score".format(algorithm) in headers or "{} Score".format(algorithm) in headers:
Expand All @@ -152,7 +152,7 @@ def determine_columns_used_for_aggregation(self, prediction_algorithms, el_algor
used_columns.extend(["{} WT Percentile".format(algorithm), "{} MT Percentile".format(algorithm)])
for algorithm in el_algorithms:
used_columns.extend(["{} WT Score".format(algorithm), "{} MT Score".format(algorithm)])
if algorithm != "MHCflurryEL Processing":
if algorithm not in ["MHCflurryEL Processing", "BigMHC_EL", "BigMHC_IM", 'DeepImmuno']:
used_columns.extend(["{} WT Percentile".format(algorithm), "{} MT Percentile".format(algorithm)])
if self.problematic_positions_exist():
used_columns.append("Problematic Positions")
Expand Down Expand Up @@ -319,7 +319,6 @@ def get_sub_df(self, all_epitopes_df, key):
df = (all_epitopes_df[lambda x: (x['Chromosome'] == key[0]) & (x['Start'] == key[1]) & (x['Stop'] == key[2]) & (x['Reference'] == key[3]) & (x['Variant'] == key[4])]).copy()
df['Variant Type'] = df['Variant Type'].cat.add_categories('NA')
df['Mutation Position'] = df['Mutation Position'].cat.add_categories('NA')
df.fillna(value="NA", inplace=True)
df['annotation'] = df[['Transcript', 'Gene Name', 'Mutation', 'Protein Position']].agg('-'.join, axis=1)
df['key'] = key_str
return (df, key_str)
Expand Down Expand Up @@ -371,13 +370,13 @@ def is_anchor_residue_pass(self, mutation):
if '-' in position:
d_ind = position.index('-')
if all(pos in anchors for pos in range(int(position[0:d_ind]), int(position[d_ind+1:])+1)):
if mutation["{} WT IC50 Score".format(self.wt_top_score_metric)] == "NA":
if pd.isna(mutation["{} WT IC50 Score".format(self.wt_top_score_metric)]):
anchor_residue_pass = False
elif mutation["{} WT IC50 Score".format(self.wt_top_score_metric)] < binding_threshold:
anchor_residue_pass = False
elif position != "NA":
if int(float(position)) in anchors:
if mutation["{} WT IC50 Score".format(self.wt_top_score_metric)] == "NA":
if pd.isna(mutation["{} WT IC50 Score".format(self.wt_top_score_metric)]):
anchor_residue_pass = False
elif mutation["{} WT IC50 Score".format(self.wt_top_score_metric)] < binding_threshold:
anchor_residue_pass = False
Expand Down Expand Up @@ -409,7 +408,7 @@ def get_tier(self, mutation, vaf_clonal):
tsl_pass = True
if mutation["Transcript Support Level"] == "Not Supported":
pass
elif mutation["Transcript Support Level"] == "NA":
elif pd.isna(mutation["Transcript Support Level"]):
tsl_pass = False
else:
if mutation["Transcript Support Level"] > self.maximum_transcript_support_level:
Expand Down Expand Up @@ -505,6 +504,9 @@ def get_good_binders(self, df):
def get_unique_good_binders(self, good_binders):
return pd.DataFrame(good_binders.groupby(['HLA Allele', 'MT Epitope Seq']).size().reset_index())

def replace_nas(self, items):
return ["NA" if pd.isna(x) else x for x in items]

def get_good_binders_metrics(self, good_binders, prediction_algorithms, el_algorithms):
peptides = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
good_peptides = good_binders["MT Epitope Seq"].unique()
Expand Down Expand Up @@ -542,10 +544,10 @@ def get_good_binders_metrics(self, good_binders, prediction_algorithms, el_algor
for index, line in good_binders_peptide_annotation.to_dict(orient='index').items():
ic50s[line['HLA Allele']] = line['{} {} IC50 Score'.format(top_score_metric, peptide_type)]
percentiles[line['HLA Allele']] = line['{} {} Percentile'.format(top_score_metric, peptide_type)]
ic50_calls[line['HLA Allele']] = [line["{} {} IC50 Score".format(algorithm, peptide_type)] for algorithm in prediction_algorithms]
percentile_calls[line['HLA Allele']] = [line["{} {} Percentile".format(algorithm, peptide_type)] for algorithm in prediction_algorithms]
el_calls[line['HLA Allele']] = [line["{} {} Score".format(algorithm, peptide_type)] for algorithm in el_algorithms]
el_percentile_calls[line['HLA Allele']] = ['NA' if algorithm == 'MHCflurryEL Processing' else line["{} {} Percentile".format(algorithm, peptide_type)] for algorithm in el_algorithms]
ic50_calls[line['HLA Allele']] = self.replace_nas([line["{} {} IC50 Score".format(algorithm, peptide_type)] for algorithm in prediction_algorithms])
percentile_calls[line['HLA Allele']] = self.replace_nas([line["{} {} Percentile".format(algorithm, peptide_type)] for algorithm in prediction_algorithms])
el_calls[line['HLA Allele']] = self.replace_nas([line["{} {} Score".format(algorithm, peptide_type)] for algorithm in el_algorithms])
el_percentile_calls[line['HLA Allele']] = self.replace_nas(['NA' if algorithm in ['MHCflurryEL Processing', 'BigMHC_EL', 'BigMHC_IM', 'DeepImmuno'] else line["{} {} Percentile".format(algorithm, peptide_type)] for algorithm in el_algorithms])
if peptide_type == 'MT' and not self.is_anchor_residue_pass(line):
anchor_fails.append(line['HLA Allele'])
sorted_ic50s = []
Expand All @@ -559,8 +561,8 @@ def get_good_binders_metrics(self, good_binders, prediction_algorithms, el_algor
sorted_percentiles.append(percentiles[hla_type])
else:
sorted_percentiles.append('X')
results[peptide]['ic50s_{}'.format(peptide_type)] = sorted_ic50s
results[peptide]['percentiles_{}'.format(peptide_type)] = sorted_percentiles
results[peptide]['ic50s_{}'.format(peptide_type)] = self.replace_nas(sorted_ic50s)
results[peptide]['percentiles_{}'.format(peptide_type)] = self.replace_nas(sorted_percentiles)
individual_ic50_calls[peptide_type] = ic50_calls
individual_percentile_calls[peptide_type] = percentile_calls
individual_el_calls[peptide_type] = el_calls
Expand All @@ -577,7 +579,7 @@ def get_good_binders_metrics(self, good_binders, prediction_algorithms, el_algor
results[peptide]['individual_el_calls'] = individual_el_calls
results[peptide]['individual_el_percentile_calls'] = individual_el_percentile_calls
wt_peptide = good_binders_peptide_annotation.iloc[0]['WT Epitope Seq']
if wt_peptide == 'NA':
if pd.isna(wt_peptide):
variant_type = good_binders_peptide_annotation.iloc[0]['Variant Type']
if variant_type == 'FS':
wt_peptide = 'FS-NA'
Expand All @@ -589,8 +591,8 @@ def get_good_binders_metrics(self, good_binders, prediction_algorithms, el_algor
peptides[set_name]['peptides'] = self.sort_peptides(results)
sorted_transcripts = self.sort_transcripts(annotations, good_binders)
peptides[set_name]['transcripts'] = list(sorted_transcripts.Annotation)
peptides[set_name]['transcript_expr'] = list(sorted_transcripts.Expr)
peptides[set_name]['tsl'] = list(sorted_transcripts.TSL)
peptides[set_name]['transcript_expr'] = self.replace_nas(list(sorted_transcripts.Expr))
peptides[set_name]['tsl'] = self.replace_nas(list(sorted_transcripts.TSL))
peptides[set_name]['biotype'] = list(sorted_transcripts.Biotype)
peptides[set_name]['transcript_length'] = [int(l) for l in list(sorted_transcripts.Length)]
peptides[set_name]['transcript_count'] = len(annotations)
Expand Down Expand Up @@ -655,7 +657,7 @@ def assemble_result_line(self, best, key, vaf_clonal, hla, anno_count, peptide_c
tier = self.get_tier(mutation=best, vaf_clonal=vaf_clonal)

problematic_positions = best['Problematic Positions'] if 'Problematic Positions' in best else 'None'
tsl = best['Transcript Support Level'] if best['Transcript Support Level'] in ["NA", "Not Supported"] else round(best['Transcript Support Level'])
tsl = best['Transcript Support Level'] if best['Transcript Support Level'] == "Not Supported" or pd.isna(best['Transcript Support Level']) else round(best['Transcript Support Level'])

out_dict = { 'ID': key }
out_dict.update({ k.replace('HLA-', ''):v for k,v in sorted(hla.items()) })
Expand Down Expand Up @@ -691,11 +693,11 @@ def get_metrics(self, df, peptides, best):
'transcript_counts': [v['transcript_count'] for k, v in peptides.items()],
'peptide_counts': [v['peptide_count'] for k, v in peptides.items()],
'set_expr': [v['total_expr'] for k, v in peptides.items()],
'DNA VAF': 'NA' if best['Tumor DNA VAF'] == 'NA' else float(best['Tumor DNA VAF']),
'RNA VAF': 'NA' if best['Tumor RNA VAF'] == 'NA' else float(best['Tumor RNA VAF']),
'gene_expr': 'NA' if best['Gene Expression'] == 'NA' else float(best['Gene Expression']),
'DNA VAF': 'NA' if pd.isna(best['Tumor DNA VAF']) else float(best['Tumor DNA VAF']),
'RNA VAF': 'NA' if pd.isna(best['Tumor RNA VAF']) else float(best['Tumor RNA VAF']),
'gene_expr': 'NA' if pd.isna(best['Gene Expression']) else float(best['Gene Expression']),
'best_peptide_mt': best['MT Epitope Seq'],
'best_peptide_wt': best['WT Epitope Seq'],
'best_peptide_wt': 'NA' if pd.isna(best['WT Epitope Seq']) else best['WT Epitope Seq'],
'best_hla_allele': best['HLA Allele'],
}

Expand Down
29 changes: 23 additions & 6 deletions pvactools/lib/output_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,12 @@ def get_scores(self, line, method):
}
else:
return {'ic50': float(line['ic50'])}
elif method.lower() == 'deepimmuno':
return {'score': float(line['immunogenicity'])}
elif method.lower() == 'bigmhc_el':
return {'score': float(line['BigMHC_EL'])}
elif method.lower() == 'bigmhc_im':
return {'score': float(line['BigMHC_IM'])}
elif method.lower() == 'netmhcpan_el':
return {'score': float(line['score'])}
elif method.lower() == 'netmhciipan_el':
Expand Down Expand Up @@ -583,7 +589,12 @@ def output_headers(self):
self.flurry_headers(headers)

pretty_method = PredictionClass.prediction_class_name_for_iedb_prediction_method(method)
if method == 'netmhcpan_el' or method == 'netmhciipan_el':
if method in ['BigMHC_EL', 'BigMHC_IM', 'DeepImmuno']:
headers.append("%s WT Score" % pretty_method)
headers.append("%s MT Score" % pretty_method)
continue

if method in ['netmhcpan_el', 'netmhciipan_el']:
headers.append("%s WT Score" % pretty_method)
headers.append("%s MT Score" % pretty_method)
else:
Expand Down Expand Up @@ -624,7 +635,7 @@ def add_pretty_row(self, row, entries, prediction_method, pretty_method, suffix)
row['MHCflurryEL Processing %s' % suffix.replace("IC50 ", "")] = s
elif st == 'mhcflurry_presentation_percentile':
row['MHCflurryEL Presentation %s' % suffix.replace("IC50 ", "")] = s
elif pretty_method == 'NetMHCpanEL' or pretty_method == 'NetMHCIIpanEL':
elif pretty_method in ['NetMHCpanEL', 'NetMHCIIpanEL', 'BigMHC_EL', 'BigMHC_IM', 'DeepImmuno']:
row['%s %s' % (pretty_method, suffix.replace("IC50 ", ""))] = s
else:
row['%s %s' % (pretty_method, suffix)] = s
Expand Down Expand Up @@ -726,8 +737,9 @@ def execute(self):
pretty_method = PredictionClass.prediction_class_name_for_iedb_prediction_method(method)
self.add_pretty_row(row, wt_scores, method, pretty_method, 'WT IC50 Score')
self.add_pretty_row(row, mt_scores, method, pretty_method, 'MT IC50 Score')
self.add_pretty_row(row, wt_percentiles, method, pretty_method, 'WT Percentile')
self.add_pretty_row(row, mt_percentiles, method, pretty_method, 'MT Percentile')
if pretty_method not in ['BigMHC_EL', 'BigMHC_IM', 'DeepImmuno']:
self.add_pretty_row(row, wt_percentiles, method, pretty_method, 'WT Percentile')
self.add_pretty_row(row, mt_percentiles, method, pretty_method, 'MT Percentile')

for (tsv_key, row_key) in zip(['gene_expression', 'transcript_expression', 'normal_vaf', 'tdna_vaf', 'trna_vaf'], ['Gene Expression', 'Transcript Expression', 'Normal VAF', 'Tumor DNA VAF', 'Tumor RNA VAF']):
if tsv_key in tsv_entry:
Expand Down Expand Up @@ -927,7 +939,11 @@ def output_headers(self):
elif self.flurry_state == 'both':
self.flurry_headers(headers)
pretty_method = PredictionClass.prediction_class_name_for_iedb_prediction_method(method)
if method == 'netmhcpan_el' or method == 'netmhciipan_el':
if method in ['BigMHC_EL', 'BigMHC_IM', 'DeepImmuno']:
headers.append("%s Score" % pretty_method)
continue

if method in ['netmhcpan_el', 'netmhciipan_el']:
headers.append("%s Score" % pretty_method)
else:
headers.append("%s IC50 Score" % pretty_method)
Expand Down Expand Up @@ -977,7 +993,8 @@ def execute(self):
for method in self.prediction_methods():
pretty_method = PredictionClass.prediction_class_name_for_iedb_prediction_method(method)
self.add_pretty_row(row, mt_scores, method, pretty_method, 'IC50 Score')
self.add_pretty_row(row, mt_percentiles, method, pretty_method, 'Percentile')
if pretty_method not in ['BigMHC_EL', 'BigMHC_IM', 'DeepImmuno']:
self.add_pretty_row(row, mt_percentiles, method, pretty_method, 'Percentile')
if self.add_sample_name:
row['Sample Name'] = self.sample_name
tsv_writer.writerow(row)
Expand Down
2 changes: 1 addition & 1 deletion pvactools/lib/post_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def get_flurry_state(self):
def is_el(self, algorithm):
if algorithm == 'MHCflurry' and self.flurry_state == 'EL_only':
return True
if algorithm in ['NetMHCIIpanEL', 'NetMHCpanEL']:
if algorithm in ['NetMHCIIpanEL', 'NetMHCpanEL', 'BigMHC_EL', 'BigMHC_IM', 'DeepImmuno']:
return True
return False

Expand Down
Loading

0 comments on commit 4f008ed

Please sign in to comment.