Merge pull request #1063 from griffithlab/bigmhc

Add BigMHC and DeepImmuno prediction algorithms
griffithlab · Feb 13, 2024 · 4f008ed · 4f008ed
2 parents d48bbda + 13217d0
commit 4f008ed
Show file tree

Hide file tree

Showing 20 changed files with 20,820 additions and 570 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -17,6 +17,13 @@ jobs:
         python-version: ['3.7', '3.8', '3.9', '3.10', '3.11']
 
     steps:
+      - name: Maximize build space
+        uses: AdityaGarg8/remove-unwanted-software@v2
+        with:
+          remove-android: 'true'
+          remove-dotnet: 'true'
+          remove-haskell: 'true'
+          remove-codeql: 'true'
       - uses: actions/checkout@v2
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v2

diff --git a/pvactools/lib/aggregate_all_epitopes.py b/pvactools/lib/aggregate_all_epitopes.py
@@ -117,7 +117,7 @@ def determine_used_prediction_algorithms(self):
         potential_algorithms = PredictionClass.prediction_methods()
         prediction_algorithms = []
         for algorithm in potential_algorithms:
-            if algorithm == 'NetMHCpanEL' or algorithm == 'NetMHCIIpanEL':
+            if algorithm in ['NetMHCpanEL', 'NetMHCIIpanEL', 'BigMHC_EL', 'BigMHC_IM', 'DeepImmuno']:
                 continue
             if "{} MT IC50 Score".format(algorithm) in headers or "{} IC50 Score".format(algorithm) in headers:
                 prediction_algorithms.append(algorithm)
@@ -129,7 +129,7 @@ def problematic_positions_exist(self):
 
     def determine_used_el_algorithms(self):
         headers = pd.read_csv(self.input_file, delimiter="\t", nrows=0).columns.tolist()
-        potential_algorithms = ["MHCflurryEL Processing", "MHCflurryEL Presentation", "NetMHCpanEL", "NetMHCIIpanEL"]
+        potential_algorithms = ["MHCflurryEL Processing", "MHCflurryEL Presentation", "NetMHCpanEL", "NetMHCIIpanEL", "BigMHC_EL", 'BigMHC_IM', 'DeepImmuno']
         prediction_algorithms = []
         for algorithm in potential_algorithms:
             if "{} MT Score".format(algorithm) in headers or "{} Score".format(algorithm) in headers:
@@ -152,7 +152,7 @@ def determine_columns_used_for_aggregation(self, prediction_algorithms, el_algor
             used_columns.extend(["{} WT Percentile".format(algorithm), "{} MT Percentile".format(algorithm)])
         for algorithm in el_algorithms:
             used_columns.extend(["{} WT Score".format(algorithm), "{} MT Score".format(algorithm)])
-            if algorithm != "MHCflurryEL Processing":
+            if algorithm not in ["MHCflurryEL Processing", "BigMHC_EL", "BigMHC_IM", 'DeepImmuno']:
                 used_columns.extend(["{} WT Percentile".format(algorithm), "{} MT Percentile".format(algorithm)])
         if self.problematic_positions_exist():
             used_columns.append("Problematic Positions")
@@ -319,7 +319,6 @@ def get_sub_df(self, all_epitopes_df, key):
         df = (all_epitopes_df[lambda x: (x['Chromosome'] == key[0]) & (x['Start'] == key[1]) & (x['Stop'] == key[2]) & (x['Reference'] == key[3]) & (x['Variant'] == key[4])]).copy()
         df['Variant Type'] = df['Variant Type'].cat.add_categories('NA')
         df['Mutation Position'] = df['Mutation Position'].cat.add_categories('NA')
-        df.fillna(value="NA", inplace=True)
         df['annotation'] = df[['Transcript', 'Gene Name', 'Mutation', 'Protein Position']].agg('-'.join, axis=1)
         df['key'] = key_str
         return (df, key_str)
@@ -371,13 +370,13 @@ def is_anchor_residue_pass(self, mutation):
         if '-' in position:
             d_ind = position.index('-')
             if all(pos in anchors for pos in range(int(position[0:d_ind]), int(position[d_ind+1:])+1)):
-                if mutation["{} WT IC50 Score".format(self.wt_top_score_metric)] == "NA":
+                if pd.isna(mutation["{} WT IC50 Score".format(self.wt_top_score_metric)]):
                     anchor_residue_pass = False
                 elif mutation["{} WT IC50 Score".format(self.wt_top_score_metric)] < binding_threshold:
                     anchor_residue_pass = False
         elif position != "NA":
             if int(float(position)) in anchors:
-                if mutation["{} WT IC50 Score".format(self.wt_top_score_metric)] == "NA":
+                if pd.isna(mutation["{} WT IC50 Score".format(self.wt_top_score_metric)]):
                     anchor_residue_pass = False
                 elif mutation["{} WT IC50 Score".format(self.wt_top_score_metric)] < binding_threshold:
                     anchor_residue_pass = False
@@ -409,7 +408,7 @@ def get_tier(self, mutation, vaf_clonal):
         tsl_pass = True
         if mutation["Transcript Support Level"] == "Not Supported":
             pass
-        elif mutation["Transcript Support Level"] == "NA":
+        elif pd.isna(mutation["Transcript Support Level"]):
             tsl_pass = False
         else:
             if mutation["Transcript Support Level"] > self.maximum_transcript_support_level:
@@ -505,6 +504,9 @@ def get_good_binders(self, df):
     def get_unique_good_binders(self, good_binders):
         return pd.DataFrame(good_binders.groupby(['HLA Allele', 'MT Epitope Seq']).size().reset_index())
 
+    def replace_nas(self, items):
+        return ["NA" if pd.isna(x) else x for x in items]
+
     def get_good_binders_metrics(self, good_binders, prediction_algorithms, el_algorithms):
         peptides = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
         good_peptides = good_binders["MT Epitope Seq"].unique()
@@ -542,10 +544,10 @@ def get_good_binders_metrics(self, good_binders, prediction_algorithms, el_algor
                         for index, line in good_binders_peptide_annotation.to_dict(orient='index').items():
                             ic50s[line['HLA Allele']] = line['{} {} IC50 Score'.format(top_score_metric, peptide_type)]
                             percentiles[line['HLA Allele']] = line['{} {} Percentile'.format(top_score_metric, peptide_type)]
-                            ic50_calls[line['HLA Allele']] = [line["{} {} IC50 Score".format(algorithm, peptide_type)] for algorithm in prediction_algorithms]
-                            percentile_calls[line['HLA Allele']] = [line["{} {} Percentile".format(algorithm, peptide_type)] for algorithm in prediction_algorithms]
-                            el_calls[line['HLA Allele']] = [line["{} {} Score".format(algorithm, peptide_type)] for algorithm in el_algorithms]
-                            el_percentile_calls[line['HLA Allele']] = ['NA' if algorithm == 'MHCflurryEL Processing' else line["{} {} Percentile".format(algorithm, peptide_type)] for algorithm in el_algorithms]
+                            ic50_calls[line['HLA Allele']] = self.replace_nas([line["{} {} IC50 Score".format(algorithm, peptide_type)] for algorithm in prediction_algorithms])
+                            percentile_calls[line['HLA Allele']] = self.replace_nas([line["{} {} Percentile".format(algorithm, peptide_type)] for algorithm in prediction_algorithms])
+                            el_calls[line['HLA Allele']] = self.replace_nas([line["{} {} Score".format(algorithm, peptide_type)] for algorithm in el_algorithms])
+                            el_percentile_calls[line['HLA Allele']] = self.replace_nas(['NA' if algorithm in ['MHCflurryEL Processing', 'BigMHC_EL', 'BigMHC_IM', 'DeepImmuno'] else line["{} {} Percentile".format(algorithm, peptide_type)] for algorithm in el_algorithms])
                             if peptide_type == 'MT' and not self.is_anchor_residue_pass(line):
                                 anchor_fails.append(line['HLA Allele'])
                         sorted_ic50s = []
@@ -559,8 +561,8 @@ def get_good_binders_metrics(self, good_binders, prediction_algorithms, el_algor
                                 sorted_percentiles.append(percentiles[hla_type])
                             else:
                                 sorted_percentiles.append('X')
-                        results[peptide]['ic50s_{}'.format(peptide_type)] = sorted_ic50s
-                        results[peptide]['percentiles_{}'.format(peptide_type)] = sorted_percentiles
+                        results[peptide]['ic50s_{}'.format(peptide_type)] = self.replace_nas(sorted_ic50s)
+                        results[peptide]['percentiles_{}'.format(peptide_type)] = self.replace_nas(sorted_percentiles)
                         individual_ic50_calls[peptide_type] = ic50_calls
                         individual_percentile_calls[peptide_type] = percentile_calls
                         individual_el_calls[peptide_type] = el_calls
@@ -577,7 +579,7 @@ def get_good_binders_metrics(self, good_binders, prediction_algorithms, el_algor
                     results[peptide]['individual_el_calls'] = individual_el_calls
                     results[peptide]['individual_el_percentile_calls'] = individual_el_percentile_calls
                     wt_peptide = good_binders_peptide_annotation.iloc[0]['WT Epitope Seq']
-                    if wt_peptide == 'NA':
+                    if pd.isna(wt_peptide):
                         variant_type = good_binders_peptide_annotation.iloc[0]['Variant Type']
                         if variant_type == 'FS':
                             wt_peptide = 'FS-NA'
@@ -589,8 +591,8 @@ def get_good_binders_metrics(self, good_binders, prediction_algorithms, el_algor
             peptides[set_name]['peptides'] = self.sort_peptides(results)
             sorted_transcripts = self.sort_transcripts(annotations, good_binders)
             peptides[set_name]['transcripts'] = list(sorted_transcripts.Annotation)
-            peptides[set_name]['transcript_expr'] = list(sorted_transcripts.Expr)
-            peptides[set_name]['tsl'] = list(sorted_transcripts.TSL)
+            peptides[set_name]['transcript_expr'] = self.replace_nas(list(sorted_transcripts.Expr))
+            peptides[set_name]['tsl'] = self.replace_nas(list(sorted_transcripts.TSL))
             peptides[set_name]['biotype'] = list(sorted_transcripts.Biotype)
             peptides[set_name]['transcript_length'] = [int(l) for l in list(sorted_transcripts.Length)]
             peptides[set_name]['transcript_count'] = len(annotations)
@@ -655,7 +657,7 @@ def assemble_result_line(self, best, key, vaf_clonal, hla, anno_count, peptide_c
         tier = self.get_tier(mutation=best, vaf_clonal=vaf_clonal)
 
         problematic_positions = best['Problematic Positions'] if 'Problematic Positions' in best else 'None'
-        tsl = best['Transcript Support Level'] if best['Transcript Support Level'] in ["NA", "Not Supported"] else round(best['Transcript Support Level'])
+        tsl = best['Transcript Support Level'] if best['Transcript Support Level'] == "Not Supported" or pd.isna(best['Transcript Support Level']) else round(best['Transcript Support Level'])
 
         out_dict = { 'ID': key }
         out_dict.update({ k.replace('HLA-', ''):v for k,v in sorted(hla.items()) })
@@ -691,11 +693,11 @@ def get_metrics(self, df, peptides, best):
             'transcript_counts': [v['transcript_count'] for k, v in peptides.items()],
             'peptide_counts': [v['peptide_count'] for k, v in peptides.items()],
             'set_expr': [v['total_expr'] for k, v in peptides.items()],
-            'DNA VAF': 'NA' if best['Tumor DNA VAF'] == 'NA' else float(best['Tumor DNA VAF']),
-            'RNA VAF': 'NA' if best['Tumor RNA VAF'] == 'NA' else float(best['Tumor RNA VAF']),
-            'gene_expr': 'NA' if best['Gene Expression'] == 'NA' else float(best['Gene Expression']),
+            'DNA VAF': 'NA' if pd.isna(best['Tumor DNA VAF']) else float(best['Tumor DNA VAF']),
+            'RNA VAF': 'NA' if pd.isna(best['Tumor RNA VAF']) else float(best['Tumor RNA VAF']),
+            'gene_expr': 'NA' if pd.isna(best['Gene Expression']) else float(best['Gene Expression']),
             'best_peptide_mt': best['MT Epitope Seq'],
-            'best_peptide_wt': best['WT Epitope Seq'],
+            'best_peptide_wt': 'NA' if pd.isna(best['WT Epitope Seq']) else best['WT Epitope Seq'],
             'best_hla_allele': best['HLA Allele'],
         }
 

diff --git a/pvactools/lib/output_parser.py b/pvactools/lib/output_parser.py
@@ -163,6 +163,12 @@ def get_scores(self, line, method):
                 }
             else:
                 return {'ic50': float(line['ic50'])}
+        elif method.lower() == 'deepimmuno':
+            return {'score': float(line['immunogenicity'])}
+        elif method.lower() == 'bigmhc_el':
+            return {'score': float(line['BigMHC_EL'])}
+        elif method.lower() == 'bigmhc_im':
+            return {'score': float(line['BigMHC_IM'])}
         elif method.lower() == 'netmhcpan_el':
             return {'score': float(line['score'])}
         elif method.lower() == 'netmhciipan_el':
@@ -583,7 +589,12 @@ def output_headers(self):
                     self.flurry_headers(headers)
 
             pretty_method = PredictionClass.prediction_class_name_for_iedb_prediction_method(method)
-            if method == 'netmhcpan_el' or method == 'netmhciipan_el':
+            if method in ['BigMHC_EL', 'BigMHC_IM', 'DeepImmuno']:
+                headers.append("%s WT Score" % pretty_method)
+                headers.append("%s MT Score" % pretty_method)
+                continue
+
+            if method in ['netmhcpan_el', 'netmhciipan_el']:
                 headers.append("%s WT Score" % pretty_method)
                 headers.append("%s MT Score" % pretty_method)
             else:
@@ -624,7 +635,7 @@ def add_pretty_row(self, row, entries, prediction_method, pretty_method, suffix)
                     row['MHCflurryEL Processing %s' % suffix.replace("IC50 ", "")] = s
                 elif st == 'mhcflurry_presentation_percentile':
                     row['MHCflurryEL Presentation %s' % suffix.replace("IC50 ", "")] = s
-                elif pretty_method == 'NetMHCpanEL' or pretty_method == 'NetMHCIIpanEL':
+                elif pretty_method in ['NetMHCpanEL', 'NetMHCIIpanEL', 'BigMHC_EL', 'BigMHC_IM', 'DeepImmuno']:
                     row['%s %s' % (pretty_method, suffix.replace("IC50 ", ""))] = s
                 else:
                     row['%s %s' % (pretty_method, suffix)] = s
@@ -726,8 +737,9 @@ def execute(self):
                     pretty_method = PredictionClass.prediction_class_name_for_iedb_prediction_method(method)
                     self.add_pretty_row(row, wt_scores, method, pretty_method, 'WT IC50 Score')
                     self.add_pretty_row(row, mt_scores, method, pretty_method, 'MT IC50 Score')
-                    self.add_pretty_row(row, wt_percentiles, method, pretty_method, 'WT Percentile')
-                    self.add_pretty_row(row, mt_percentiles, method, pretty_method, 'MT Percentile')
+                    if pretty_method not in ['BigMHC_EL', 'BigMHC_IM', 'DeepImmuno']:
+                        self.add_pretty_row(row, wt_percentiles, method, pretty_method, 'WT Percentile')
+                        self.add_pretty_row(row, mt_percentiles, method, pretty_method, 'MT Percentile')
 
                 for (tsv_key, row_key) in zip(['gene_expression', 'transcript_expression', 'normal_vaf', 'tdna_vaf', 'trna_vaf'], ['Gene Expression', 'Transcript Expression', 'Normal VAF', 'Tumor DNA VAF', 'Tumor RNA VAF']):
                     if tsv_key in tsv_entry:
@@ -927,7 +939,11 @@ def output_headers(self):
                 elif self.flurry_state == 'both':
                     self.flurry_headers(headers)
             pretty_method = PredictionClass.prediction_class_name_for_iedb_prediction_method(method)
-            if method == 'netmhcpan_el' or method == 'netmhciipan_el':
+            if method in ['BigMHC_EL', 'BigMHC_IM', 'DeepImmuno']:
+                headers.append("%s Score" % pretty_method)
+                continue
+
+            if method in ['netmhcpan_el', 'netmhciipan_el']:
                 headers.append("%s Score" % pretty_method)
             else:
                 headers.append("%s IC50 Score" % pretty_method)
@@ -977,7 +993,8 @@ def execute(self):
             for method in self.prediction_methods():
                 pretty_method = PredictionClass.prediction_class_name_for_iedb_prediction_method(method)
                 self.add_pretty_row(row, mt_scores, method, pretty_method, 'IC50 Score')
-                self.add_pretty_row(row, mt_percentiles, method, pretty_method, 'Percentile')
+                if pretty_method not in ['BigMHC_EL', 'BigMHC_IM', 'DeepImmuno']:
+                    self.add_pretty_row(row, mt_percentiles, method, pretty_method, 'Percentile')
             if self.add_sample_name:
                 row['Sample Name'] = self.sample_name
             tsv_writer.writerow(row)

diff --git a/pvactools/lib/post_processor.py b/pvactools/lib/post_processor.py
@@ -48,7 +48,7 @@ def get_flurry_state(self):
     def is_el(self, algorithm):
         if algorithm == 'MHCflurry' and self.flurry_state == 'EL_only':
             return True
-        if algorithm in ['NetMHCIIpanEL', 'NetMHCpanEL']:
+        if algorithm in ['NetMHCIIpanEL', 'NetMHCpanEL', 'BigMHC_EL', 'BigMHC_IM', 'DeepImmuno']:
             return True
         return False