griffithlab · susannasiebert · Jan 8, 2025 · Jan 8, 2025
diff --git a/docs/pvacseq/output_files.rst b/docs/pvacseq/output_files.rst
@@ -279,7 +279,13 @@ transcripts covered by those epitopes, as well as the HLA alleles that those
 epitopes are well-binding to. Lastly, the report will bin variants into tiers
 that offer suggestions as to the suitability of variants for use in vaccines.
 
-Only epitopes meeting the ``--aggregate-inclusion-binding-threshold`` are included in this report (default: 5000).
+Additionally, a metrics.json file gets created, containing metadata about the
+Best Peptide as well as alternate neoantigen candidates for each variant. This
+file can be loaded into pVACview in conjunction with the aggregated report in
+order to visualize the candidates. In order to limit the size of the
+metrics.json file, only a limited number of neoantigen candidates are included
+in this file. Only neoantigen candidates meeting the ``--aggregate-inclusion-binding-threshold``
+are included in this file (default: 5000).
 If the number of unique epitopes for a mutation meeting this threshold exceeds the
 ``--aggregate-inclusion-count-limit``, only the top n epitopes up to this
 limit are included (default: 15). The method for selecting the top n epitopes is analogous to
@@ -293,8 +299,12 @@ anchor criteria was passed, the MT IC50 score, the transcript length,
 and the MT percentile. From this sorted list the top n entries are selected up
 to the ``--aggregate-inclusion-count-limit``.
 
+If the Best Peptide does not meet the aggregate inclusion criteria, it will still be
+included in the metrics.json file and counted in the ``Num Included
+Peptides``.
+
 Whether the median or the lowest binding affinity metrics are used for determining the
-included eptiopes, selecting the best-scoring epitope, and which values are output in the ``IC50 MT``,
+included epitopes, selecting the best-scoring epitope, and which values are output in the ``IC50 MT``,
 ``IC50 WT``, ``%ile MT``, and ``%ile WT`` columns is controlled by the
 ``--top-score-metric`` parameter.
 
@@ -371,9 +381,7 @@ included eptiopes, selecting the best-scoring epitope, and which values are outp
 Best Peptide Criteria
 _____________________
 
-To determine the Best Peptide, all peptides meeting the
-``--aggregate-inclusion-threshold`` and ``--aggregate-inclusion-count-limit``
-(see above) are evaluated as follows:
+To determine the Best Peptide, all peptides for a variant are evaluated as follows:
 
 - Pick all entries with a variant transcript that have a ``protein_coding`` Biotype
 - Of the remaining entries, pick the ones with a variant transcript having

diff --git a/docs/pvacsplice/output_files.rst b/docs/pvacsplice/output_files.rst
@@ -298,6 +298,10 @@ and ``%ile MT`` columns is controlled by the ``--top-score-metric`` parameter.
      - A list of positions in the Best Peptide that are problematic.
        ``None`` if the ``--problematic-pos`` parameter was not set during
        the pVACseq run
+   * - ``Num Included Peptides``
+     - The number of included peptides according to the
+       ``--aggregate-inclusion-binding-threshold`` and
+       ``--aggregate-inclusion-count-limit``.
    * - ``Num Passing Peptides``
      - The number of unique well-binding peptides for this mutation.
    * - ``IC50 MT``

diff --git a/pvactools/lib/aggregate_all_epitopes.py b/pvactools/lib/aggregate_all_epitopes.py
@@ -97,21 +97,18 @@ def get_best_mut_line(self, df, key, prediction_algorithms, el_algorithms, perce
 
         #these are all lines meeting the aggregate inclusion binding threshold
         included_df = self.get_included_df(df)
-        if len(included_df) > 0:
-            peptide_hla_counts = self.get_unique_peptide_hla_counts(included_df)
-            hla_counts = Counter(peptide_hla_counts["HLA Allele"])
-            hla = dict(map(lambda x : (x, hla_counts[x]) if x in hla_counts else (x, ""), self.hla_types))
-            #get a list of all unique gene/transcript/aa_change combinations
-            #store a count of all unique peptides that passed
-            (peptides, anno_count) = self.get_included_df_metrics(included_df, prediction_algorithms, el_algorithms, percentile_algorithms)
-            included_peptide_count = self.calculate_unique_peptide_count(included_df)
-            good_binder_count = self.calculate_good_binder_count(included_df)
-        else:
-            hla = dict(map(lambda x : (x, ""), self.hla_types))
-            peptides = {}
-            anno_count = self.get_default_annotation_count()
-            included_peptide_count = 0
-            good_binder_count = 0
+        best_df = pd.DataFrame.from_dict([best])
+        if not best_df.index.isin(included_df.index).all():
+            included_df = pd.concat([included_df, best_df])
+        best_df = best_df.to_dict()
+        peptide_hla_counts = self.get_unique_peptide_hla_counts(included_df)
+        hla_counts = Counter(peptide_hla_counts["HLA Allele"])
+        hla = dict(map(lambda x : (x, hla_counts[x]) if x in hla_counts else (x, ""), self.hla_types))
+        #get a list of all unique gene/transcript/aa_change combinations
+        #store a count of all unique peptides that passed
+        (peptides, anno_count) = self.get_included_df_metrics(included_df, prediction_algorithms, el_algorithms, percentile_algorithms)
+        included_peptide_count = self.calculate_unique_peptide_count(included_df)
+        good_binder_count = self.calculate_good_binder_count(included_df)
 
         #assemble the line
         out_dict = self.assemble_result_line(best, key, vaf_clonal, hla, anno_count, included_peptide_count, good_binder_count)
@@ -362,7 +359,6 @@ def calculate_clonal_vaf(self):
 
     def read_input_file(self, used_columns, dtypes):
         df = pd.read_csv(self.input_file, delimiter='\t', float_precision='high', low_memory=False, na_values="NA", keep_default_na=False, usecols=used_columns, dtype=dtypes)
-        df = df.dropna(subset=["{} MT IC50 Score".format(self.mt_top_score_metric)]).reset_index()
         df = df.astype({"{} MT IC50 Score".format(self.mt_top_score_metric):'float'})
         return df
 
@@ -409,7 +405,7 @@ def get_best_binder(self, df):
             "Transcript Support Level",
             "Transcript Length",
         ], inplace=True, ascending=[True, True, False])
-        return anchor_residue_pass_df.iloc[0].to_dict()
+        return anchor_residue_pass_df.iloc[0]
 
     def is_anchor_residue_pass(self, mutation):
         if self.use_allele_specific_binding_thresholds and mutation['HLA Allele'] in self.allele_specific_binding_thresholds:

diff --git a/tests/test_aggregate_all_epitopes.py b/tests/test_aggregate_all_epitopes.py
@@ -8,7 +8,7 @@
 from pvactools.lib.aggregate_all_epitopes import PvacseqAggregateAllEpitopes, PvacfuseAggregateAllEpitopes, PvacbindAggregateAllEpitopes, PvacspliceAggregateAllEpitopes
 from tests.utils import *
 
-class AggregateAllEptiopesTests(unittest.TestCase):
+class AggregateAllEpitopesTests(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         #locate the bin and test_data directories