Add pathogen-cluster-mutations command

Adds a new top-level pathogen-cluster-mutations command that allows users to pass a reference sequence, an alignment, and a table of cluster assignments and get a table of mutations found in each cluster at some minimum count and frequency. This is a direct port of the cluster_mutation.py script [1] from the cartography project. It exists as its own separate command so users (including us) can find mutations for any predefined genetic groups including clusters, Nextstrain clades, MCCs, Pango lineages, etc. Closes #20 [1] https://github.com/blab/cartography/blob/717deb1142e1a14f6b74c5a6e794e902dda4aa1c/notebooks/scripts/cluster_mutation.py
blab · Aug 21, 2024 · 0538bb6 · 0538bb6
1 parent 5975884
commit 0538bb6
Show file tree

Hide file tree

Showing 6 changed files with 219 additions and 3 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,5 +1,13 @@
 # CHANGELOG
 
+## 3.1.0
+
+### Features
+
+* pathogen-cluster-mutations: Add this new top-level command to allow users to create a table of mutations that appear in clusters or other previously defined genetic groups ([#36][])
+
+[#36]: https://github.com/blab/pathogen-embed/pull/36
+
 ## 3.0.0
 
 ### Major Changes

diff --git a/setup.py b/setup.py
@@ -7,7 +7,7 @@
 
 setup(
     name='pathogen-embed',
-    version='3.0.0',
+    version='3.1.0',
     description='Reduced dimension embeddings for pathogen sequences',
     url='https://github.com/blab/pathogen-embed/',
     author='Sravani Nanduri <[email protected]> , John Huddleston <[email protected]>',
@@ -54,7 +54,8 @@
         "console_scripts": [
             "pathogen-embed = pathogen_embed.__main__:run_embed",
             "pathogen-distance = pathogen_embed.__main__:run_distance",
-            "pathogen-cluster = pathogen_embed.__main__:run_cluster"
+            "pathogen-cluster = pathogen_embed.__main__:run_cluster",
+            "pathogen-cluster-mutations = pathogen_embed.__main__:run_cluster_mutations",
         ]
     }
 )
diff --git a/src/pathogen_embed/__main__.py b/src/pathogen_embed/__main__.py
@@ -90,7 +90,7 @@ def make_parser_cluster():
     options_group.add_argument("--label-attribute", help="the name of the cluster used to label the column in the resulting dataframe")
     options_group.add_argument("--random-seed", default = 314159, type=int, help="an integer used for reproducible results.")
     options_group.add_argument("--min-size", type=int, default=10, help="minimum cluster size for HDBSCAN")
-    options_group.add_argument("--min-samples", type=int, default=5, help="minimum number of sample to seed a cluster for HDBSCAN. Lowering this value reduces number of samples that do not get clustered.")
+    options_group.add_argument("--min-samples", type=int, default=5, help="minimum number of samples to seed a cluster for HDBSCAN. Lowering this value reduces number of samples that do not get clustered.")
     options_group.add_argument("--distance-threshold", type=float, help="The float value for the distance threshold by which to cluster data in the embedding and assign labels via HDBSCAN. If no value is given in distance-threshold, the default distance threshold of 0.0 will be used.")
 
     output_group = parser.add_argument_group(
@@ -102,6 +102,29 @@ def make_parser_cluster():
 
     return parser
 
+def make_parser_cluster_mutations():
+    parser = argparse.ArgumentParser(
+        description="""Find mutations associated with clusters.
+        First, find pairwise mutations between each sequence in the given alignment and the given reference.
+        Then, filter to mutations that are present in at least a fixed number or proportion of sequences in cluster.
+        Finally, output a table of mutations as '<position><allele>' with the number and list of clusters each mutation appears in.
+        """,
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+
+    parser.add_argument("--reference-sequence", required=True, help="FASTA file of the reference sequence used for the alignment")
+    parser.add_argument("--alignment", required=True, help="a FASTA file of an alignment used to create clusters")
+    parser.add_argument("--clusters", required=True, help="a CSV data frame with cluster annotations")
+    parser.add_argument("--cluster-column", required=True, help="the name of the column in the given data frame with cluster labels")
+    parser.add_argument("--ignored-clusters", nargs="+", default=["-1"], help="a list of cluster labels to ignore when calculating cluster-specific mutations")
+    parser.add_argument("--valid-characters", nargs="+", default=["A", "C", "T", "G", "-"], help="a list of valid characters to consider in pairwise comparisons with the reference")
+    parser.add_argument("--min-allele-count", type=int, default=10, help="the minimum number of samples in a cluster with a given alternate allele required to include the allele in cluster-specific mutations")
+    parser.add_argument("--min-allele-frequency", type=float, default=0.5, help="the minimum frequency of an allele in a cluster to include allele in cluster-specific mutations")
+    parser.add_argument("--verbose", action="store_true", help="print additional details to the terminal")
+    parser.add_argument("--output", help="a CSV data frame with mutations per cluster")
+
+    return parser
+
 def run_embed():
     args = make_parser_embed().parse_args(argv[1:])
 
@@ -119,3 +142,9 @@ def run_cluster():
 
     from .pathogen_embed import cluster
     return cluster(args)
+
+def run_cluster_mutations():
+    args = make_parser_cluster_mutations().parse_args(argv[1:])
+
+    from .pathogen_embed import cluster_mutations
+    return cluster_mutations(args)
diff --git a/src/pathogen_embed/pathogen_embed.py b/src/pathogen_embed/pathogen_embed.py
@@ -764,3 +764,122 @@ def distance(args):
         args.indel_distance,
     )
     distance_matrix.to_csv(args.output)
+
+def cluster_mutations(args):
+    # Load the reference sequence used to produce the given alignment.
+    reference = str(next(Bio.SeqIO.parse(args.reference_sequence, "fasta")).seq)
+
+    # Create a list of gap sites at the beginning and end of the reference
+    # sequence to ignore from the alignment. This happens when the reference is
+    # missing sequences that are present in the consensus sequences. These
+    # differences usually reflect missing data in the original sequencing of the
+    # reference and not a biologically-relevant insertion.
+    ignored_sites = []
+    site = 0
+    while reference[site] == "-" and site < len(reference):
+        ignored_sites.append(site)
+        site += 1
+
+    site = len(reference) - 1
+    while reference[site] == "-" and site >= 0:
+        ignored_sites.append(site)
+        site -= 1
+
+    if args.verbose:
+        print(f"Ignoring leading and trailing gaps in the reference at sites: {ignored_sites}", file=sys.stderr)
+        print(f"Valid characters: {args.valid_characters}", file=sys.stderr)
+
+    sequences_by_name = OrderedDict()
+
+    for sequence in Bio.SeqIO.parse(args.alignment, "fasta"):
+        sequences_by_name[sequence.id] = str(sequence.seq)
+
+    sequence_names = list(sequences_by_name.keys())
+
+    # Index cluster name by sequence name from metadata.
+    clade_annotations = pd.read_csv(args.clusters)
+    clade_annotations["cluster"] = clade_annotations[args.cluster_column]
+    clade_annotations = clade_annotations[["strain", "cluster"]]
+
+    clade_annotations = clade_annotations.set_index("strain")
+
+    # Remove records for cluster labels we ignore. For example, we often ignore
+    # cluster labels that represent unclustered samples ("-1").
+    clade_annotations["cluster"] = clade_annotations["cluster"].astype(str)
+    clade_annotations = clade_annotations[
+        ~clade_annotations["cluster"].isin(args.ignored_clusters)
+    ].copy()
+
+    # Find mutations per cluster relative to reference as Python dictionary of sets indexed by cluster name
+    strains_per_cluster_reference = {}
+    for clade in clade_annotations.groupby("cluster"):
+        strains_per_cluster = set(clade[1].index)
+        strains_per_cluster_reference[clade[0]] = strains_per_cluster
+
+    all_mutation_counts = []
+    for cluster, cluster_strains in strains_per_cluster_reference.items():
+        mutations = []
+        for strain in cluster_strains:
+            strain_sequence = sequences_by_name[strain]
+
+            # Compare each strain to the reference.
+            for site in range(len(reference)):
+                if all((
+                    site not in ignored_sites,
+                    strain_sequence[site] != reference[site],
+                    strain_sequence[site] in args.valid_characters,
+                    reference[site] in args.valid_characters,
+                )):
+                    # Report the site in 1-based coordinates.
+                    mutations.append(
+                        {
+                            "cluster": cluster,
+                            "strain": strain,
+                            "site": site + 1,
+                            "reference_allele": reference[site],
+                            "alternate_allele": strain_sequence[site],
+                        }
+                    )
+
+        mutations = pd.DataFrame(mutations)
+        mutations["mutation"] = mutations.apply(
+            lambda row: str(row["site"]) + row["alternate_allele"],
+            axis=1,
+        )
+
+        mutation_counts = mutations.groupby("mutation")["strain"].count().reset_index().rename(columns={"strain": "count"})
+        mutation_counts["frequency"] = mutation_counts["count"] / len(cluster_strains)
+
+        # Filter by minimum allele count and frequency.
+        mutation_counts = mutation_counts[mutation_counts["count"] >= args.min_allele_count].copy()
+        mutation_counts = mutation_counts[mutation_counts["frequency"] >= args.min_allele_frequency].copy()
+
+        mutation_counts["cluster"] = cluster
+
+        all_mutation_counts.append(mutation_counts)
+
+    # Count the number of clusters with each distinct alternate allele and find
+    # the specific clusters with each allele.
+    all_mutation_counts = pd.concat(all_mutation_counts, ignore_index=True)
+
+    mutation_cluster_counts = all_mutation_counts.groupby(
+        "mutation"
+    ).aggregate(
+        cluster_count=("cluster", "count"),
+        distinct_clusters=("cluster", "unique")
+    ).reset_index()
+
+    # Remove mutations that are present in all clusters (reference-only mutations).
+    total_clusters = all_mutation_counts["cluster"].drop_duplicates().shape[0]
+    mutation_cluster_counts = mutation_cluster_counts[mutation_cluster_counts["cluster_count"] < total_clusters].copy()
+
+    # Format distinct clusters as a comma-delimited list of names.
+    mutation_cluster_counts["distinct_clusters"] = mutation_cluster_counts["distinct_clusters"].apply(
+        lambda clusters: ",".join(clusters)
+    )
+
+    # Annotate column used for clusters.
+    mutation_cluster_counts["cluster_column"] = args.cluster_column
+
+    # Save mutations and their cluster information.
+    mutation_cluster_counts.to_csv(args.output, index=False)
diff --git a/tests/data/h3n2_ha_reference.fasta b/tests/data/h3n2_ha_reference.fasta
@@ -0,0 +1,26 @@
+>U26830.1 Influenza A virus (A/Beijing/32/1992(H3N2)) hemagglutinin gene, complete cds
+ATGAAGACTATCATTGCTTTGAGCTACATTTTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACA
+ACAGCACAGCAACGCTGTGCCTGGGACATCATGCAGTGCCAAACGGAACGCTAGTGAAAACAATCACGAA
+TGATCAAATTGAAGTGACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTAGAATATGCGACAGT
+CCTCACCGAATCCTTGATGGAAAAAACTGCACACTGATAGATGCTCTATTGGGAGACCCTCATTGTGATG
+GCTTCCAAAATAAGGAATGGGACCTTTTTGTTGAACGCAGCAAAGCTTACAGCAACTGTTACCCTTATGA
+TGTACCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCAGGCACCCTGGAGTTTATCAATGAAGAC
+TTCAATTGGACTGGAGTCGCTCAGGATGGGGGAAGCTATGCTTGCAAAAGGGGATCTGTTAACAGTTTCT
+TTAGTAGATTGAATTGGTTGCACAAATCAGAATACAAATATCCAGCGCTGAACGTGACTATGCCAAACAA
+TGGCAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGAGCACGGACAGAGACCAAACCAGCCTA
+TATGTTCGAGCATCAGGGAGAGTCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAACCCCGAATATCG
+GGTCTAGACCCTGGGTAAGGGGTCAGTCCAGTAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGA
+CATACTTTTGATTAATAGCACAGGGAATCTAATTGCTCCTCGGGGTTACTTCAAAATACGAAATGGGAAA
+AGCTCAATAATGAGGTCAGATGCACCCATTGGCACCTGCAGTTCTGAATGCATCACTCCAAATGGAAGCA
+TTCCCAATGACAAACCTTTTCAAAATGTAAACAGGATCACATATGGGGCCTGCCCCAGATATGTTAAGCA
+AAACACTCTGAAATTGGCAACAGGGATGCGGAATGTACCAGAGAAACAAACTAGAGGCATATTCGGCGCA
+ATCGCAGGTTTCATAGAAAATGGTTGGGAGGGAATGGTAGACGGTTGGTACGGTTTCAGGCATCAAAATT
+CTGAGGGCACAGGACAAGCAGCAGATCTTAAAAGCACTCAAGCAGCAATCGACCAAATCAACGGGAAACT
+GAATAGGTTAATCGAGAAAACGAACGAGAAATTCCATCAAATCGAAAAAGAATTCTCAGAAGTAGAAGGG
+AGAATTCAGGACCTCGAGAAATATGTTGAAGACACTAAAATAGATCTCTGGTCTTACAACGCGGAGCTTC
+TTGTTGCCCTGGAGAACCAACATACAATTGATCTTACTGACTCAGAAATGAACAAACTGTTTGAAAAAAC
+AAGGAAGCAACTGAGGGAAAATGCTGAGGACATGGGCAATGGTTGCTTCAAAATATACCACAAATGTGAC
+AATGCCTGCATAGGGTCAATCAGAAATGGAACTTATGACCATGATGTATACAGAGACGAAGCATTAAACA
+ACCGGTTCCAGATCAAAGGTGTTGAGCTGAAGTCAGGATACAAAGATTGGATCCTGTGGATTTCCTTTGC
+CATATCATGCTTTTTGCTTTGTGTTGTTTTGCTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATT
+AGGTGTAACATTTGCATTTGA
diff --git a/tests/pathogen-cluster-mutations.t b/tests/pathogen-cluster-mutations.t
@@ -0,0 +1,33 @@
+Run pathogen-embed with PCA on a H3N2 HA alignment.
+
+  $ pathogen-embed \
+  >   --alignment $TESTDIR/data/h3n2_ha_alignment.sorted.fasta \
+  >   --output-dataframe embed.csv \
+  >   pca \
+  >   --components 2
+
+Find clusters from the embedding.
+
+  $ pathogen-cluster \
+  >   --embedding embed.csv \
+  >   --label-attribute cluster_label \
+  >   --distance-threshold 0.5 \
+  >   --output-dataframe cluster_embed.csv
+
+Find mutations per cluster.
+
+  $ pathogen-cluster-mutations \
+  >   --reference-sequence $TESTDIR/data/h3n2_ha_reference.fasta \
+  >   --alignment $TESTDIR/data/h3n2_ha_alignment.sorted.fasta \
+  >   --clusters cluster_embed.csv \
+  >   --cluster-column cluster_label \
+  >   --min-allele-count 10 \
+  >   --min-allele-frequency 0.5 \
+  >   --output mutations_cluster_embed.csv
+
+Confirm that the mutation table output has the correct structure and more than one row.
+
+  $ head -n 1 mutations_cluster_embed.csv
+  mutation,cluster_count,distinct_clusters,cluster_column
+
+  $ [[ $(sed 1d mutations_cluster_embed.csv | wc -l | sed 's/ //g') > 0 ]]