Skip to content

Commit

Permalink
Add pathogen-cluster-mutations command
Browse files Browse the repository at this point in the history
Adds a new top-level pathogen-cluster-mutations command that allows
users to pass a reference sequence, an alignment, and a table of cluster
assignments and get a table of mutations found in each cluster at some
minimum count and frequency.

This is a direct port of the cluster_mutation.py script [1] from the
cartography project. It exists as its own separate command so
users (including us) can find mutations for any predefined genetic
groups including clusters, Nextstrain clades, MCCs, Pango lineages, etc.

Closes #20

[1] https://github.com/blab/cartography/blob/717deb1142e1a14f6b74c5a6e794e902dda4aa1c/notebooks/scripts/cluster_mutation.py
  • Loading branch information
huddlej committed Aug 21, 2024
1 parent 5975884 commit 0538bb6
Show file tree
Hide file tree
Showing 6 changed files with 219 additions and 3 deletions.
8 changes: 8 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# CHANGELOG

## 3.1.0

### Features

* pathogen-cluster-mutations: Add this new top-level command to allow users to create a table of mutations that appear in clusters or other previously defined genetic groups ([#36][])

[#36]: https://github.com/blab/pathogen-embed/pull/36

## 3.0.0

### Major Changes
Expand Down
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

setup(
name='pathogen-embed',
version='3.0.0',
version='3.1.0',
description='Reduced dimension embeddings for pathogen sequences',
url='https://github.com/blab/pathogen-embed/',
author='Sravani Nanduri <[email protected]> , John Huddleston <[email protected]>',
Expand Down Expand Up @@ -54,7 +54,8 @@
"console_scripts": [
"pathogen-embed = pathogen_embed.__main__:run_embed",
"pathogen-distance = pathogen_embed.__main__:run_distance",
"pathogen-cluster = pathogen_embed.__main__:run_cluster"
"pathogen-cluster = pathogen_embed.__main__:run_cluster",
"pathogen-cluster-mutations = pathogen_embed.__main__:run_cluster_mutations",
]
}
)
31 changes: 30 additions & 1 deletion src/pathogen_embed/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def make_parser_cluster():
options_group.add_argument("--label-attribute", help="the name of the cluster used to label the column in the resulting dataframe")
options_group.add_argument("--random-seed", default = 314159, type=int, help="an integer used for reproducible results.")
options_group.add_argument("--min-size", type=int, default=10, help="minimum cluster size for HDBSCAN")
options_group.add_argument("--min-samples", type=int, default=5, help="minimum number of sample to seed a cluster for HDBSCAN. Lowering this value reduces number of samples that do not get clustered.")
options_group.add_argument("--min-samples", type=int, default=5, help="minimum number of samples to seed a cluster for HDBSCAN. Lowering this value reduces number of samples that do not get clustered.")
options_group.add_argument("--distance-threshold", type=float, help="The float value for the distance threshold by which to cluster data in the embedding and assign labels via HDBSCAN. If no value is given in distance-threshold, the default distance threshold of 0.0 will be used.")

output_group = parser.add_argument_group(
Expand All @@ -102,6 +102,29 @@ def make_parser_cluster():

return parser

def make_parser_cluster_mutations():
parser = argparse.ArgumentParser(
description="""Find mutations associated with clusters.
First, find pairwise mutations between each sequence in the given alignment and the given reference.
Then, filter to mutations that are present in at least a fixed number or proportion of sequences in cluster.
Finally, output a table of mutations as '<position><allele>' with the number and list of clusters each mutation appears in.
""",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)

parser.add_argument("--reference-sequence", required=True, help="FASTA file of the reference sequence used for the alignment")
parser.add_argument("--alignment", required=True, help="a FASTA file of an alignment used to create clusters")
parser.add_argument("--clusters", required=True, help="a CSV data frame with cluster annotations")
parser.add_argument("--cluster-column", required=True, help="the name of the column in the given data frame with cluster labels")
parser.add_argument("--ignored-clusters", nargs="+", default=["-1"], help="a list of cluster labels to ignore when calculating cluster-specific mutations")
parser.add_argument("--valid-characters", nargs="+", default=["A", "C", "T", "G", "-"], help="a list of valid characters to consider in pairwise comparisons with the reference")
parser.add_argument("--min-allele-count", type=int, default=10, help="the minimum number of samples in a cluster with a given alternate allele required to include the allele in cluster-specific mutations")
parser.add_argument("--min-allele-frequency", type=float, default=0.5, help="the minimum frequency of an allele in a cluster to include allele in cluster-specific mutations")
parser.add_argument("--verbose", action="store_true", help="print additional details to the terminal")
parser.add_argument("--output", help="a CSV data frame with mutations per cluster")

return parser

def run_embed():
args = make_parser_embed().parse_args(argv[1:])

Expand All @@ -119,3 +142,9 @@ def run_cluster():

from .pathogen_embed import cluster
return cluster(args)

def run_cluster_mutations():
args = make_parser_cluster_mutations().parse_args(argv[1:])

from .pathogen_embed import cluster_mutations
return cluster_mutations(args)
119 changes: 119 additions & 0 deletions src/pathogen_embed/pathogen_embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -764,3 +764,122 @@ def distance(args):
args.indel_distance,
)
distance_matrix.to_csv(args.output)

def cluster_mutations(args):
# Load the reference sequence used to produce the given alignment.
reference = str(next(Bio.SeqIO.parse(args.reference_sequence, "fasta")).seq)

# Create a list of gap sites at the beginning and end of the reference
# sequence to ignore from the alignment. This happens when the reference is
# missing sequences that are present in the consensus sequences. These
# differences usually reflect missing data in the original sequencing of the
# reference and not a biologically-relevant insertion.
ignored_sites = []
site = 0
while reference[site] == "-" and site < len(reference):
ignored_sites.append(site)
site += 1

site = len(reference) - 1
while reference[site] == "-" and site >= 0:
ignored_sites.append(site)
site -= 1

if args.verbose:
print(f"Ignoring leading and trailing gaps in the reference at sites: {ignored_sites}", file=sys.stderr)
print(f"Valid characters: {args.valid_characters}", file=sys.stderr)

sequences_by_name = OrderedDict()

for sequence in Bio.SeqIO.parse(args.alignment, "fasta"):
sequences_by_name[sequence.id] = str(sequence.seq)

sequence_names = list(sequences_by_name.keys())

# Index cluster name by sequence name from metadata.
clade_annotations = pd.read_csv(args.clusters)
clade_annotations["cluster"] = clade_annotations[args.cluster_column]
clade_annotations = clade_annotations[["strain", "cluster"]]

clade_annotations = clade_annotations.set_index("strain")

# Remove records for cluster labels we ignore. For example, we often ignore
# cluster labels that represent unclustered samples ("-1").
clade_annotations["cluster"] = clade_annotations["cluster"].astype(str)
clade_annotations = clade_annotations[
~clade_annotations["cluster"].isin(args.ignored_clusters)
].copy()

# Find mutations per cluster relative to reference as Python dictionary of sets indexed by cluster name
strains_per_cluster_reference = {}
for clade in clade_annotations.groupby("cluster"):
strains_per_cluster = set(clade[1].index)
strains_per_cluster_reference[clade[0]] = strains_per_cluster

all_mutation_counts = []
for cluster, cluster_strains in strains_per_cluster_reference.items():
mutations = []
for strain in cluster_strains:
strain_sequence = sequences_by_name[strain]

# Compare each strain to the reference.
for site in range(len(reference)):
if all((
site not in ignored_sites,
strain_sequence[site] != reference[site],
strain_sequence[site] in args.valid_characters,
reference[site] in args.valid_characters,
)):
# Report the site in 1-based coordinates.
mutations.append(
{
"cluster": cluster,
"strain": strain,
"site": site + 1,
"reference_allele": reference[site],
"alternate_allele": strain_sequence[site],
}
)

mutations = pd.DataFrame(mutations)
mutations["mutation"] = mutations.apply(
lambda row: str(row["site"]) + row["alternate_allele"],
axis=1,
)

mutation_counts = mutations.groupby("mutation")["strain"].count().reset_index().rename(columns={"strain": "count"})
mutation_counts["frequency"] = mutation_counts["count"] / len(cluster_strains)

# Filter by minimum allele count and frequency.
mutation_counts = mutation_counts[mutation_counts["count"] >= args.min_allele_count].copy()
mutation_counts = mutation_counts[mutation_counts["frequency"] >= args.min_allele_frequency].copy()

mutation_counts["cluster"] = cluster

all_mutation_counts.append(mutation_counts)

# Count the number of clusters with each distinct alternate allele and find
# the specific clusters with each allele.
all_mutation_counts = pd.concat(all_mutation_counts, ignore_index=True)

mutation_cluster_counts = all_mutation_counts.groupby(
"mutation"
).aggregate(
cluster_count=("cluster", "count"),
distinct_clusters=("cluster", "unique")
).reset_index()

# Remove mutations that are present in all clusters (reference-only mutations).
total_clusters = all_mutation_counts["cluster"].drop_duplicates().shape[0]
mutation_cluster_counts = mutation_cluster_counts[mutation_cluster_counts["cluster_count"] < total_clusters].copy()

# Format distinct clusters as a comma-delimited list of names.
mutation_cluster_counts["distinct_clusters"] = mutation_cluster_counts["distinct_clusters"].apply(
lambda clusters: ",".join(clusters)
)

# Annotate column used for clusters.
mutation_cluster_counts["cluster_column"] = args.cluster_column

# Save mutations and their cluster information.
mutation_cluster_counts.to_csv(args.output, index=False)
26 changes: 26 additions & 0 deletions tests/data/h3n2_ha_reference.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
>U26830.1 Influenza A virus (A/Beijing/32/1992(H3N2)) hemagglutinin gene, complete cds
ATGAAGACTATCATTGCTTTGAGCTACATTTTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACA
ACAGCACAGCAACGCTGTGCCTGGGACATCATGCAGTGCCAAACGGAACGCTAGTGAAAACAATCACGAA
TGATCAAATTGAAGTGACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTAGAATATGCGACAGT
CCTCACCGAATCCTTGATGGAAAAAACTGCACACTGATAGATGCTCTATTGGGAGACCCTCATTGTGATG
GCTTCCAAAATAAGGAATGGGACCTTTTTGTTGAACGCAGCAAAGCTTACAGCAACTGTTACCCTTATGA
TGTACCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCAGGCACCCTGGAGTTTATCAATGAAGAC
TTCAATTGGACTGGAGTCGCTCAGGATGGGGGAAGCTATGCTTGCAAAAGGGGATCTGTTAACAGTTTCT
TTAGTAGATTGAATTGGTTGCACAAATCAGAATACAAATATCCAGCGCTGAACGTGACTATGCCAAACAA
TGGCAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGAGCACGGACAGAGACCAAACCAGCCTA
TATGTTCGAGCATCAGGGAGAGTCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAACCCCGAATATCG
GGTCTAGACCCTGGGTAAGGGGTCAGTCCAGTAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGA
CATACTTTTGATTAATAGCACAGGGAATCTAATTGCTCCTCGGGGTTACTTCAAAATACGAAATGGGAAA
AGCTCAATAATGAGGTCAGATGCACCCATTGGCACCTGCAGTTCTGAATGCATCACTCCAAATGGAAGCA
TTCCCAATGACAAACCTTTTCAAAATGTAAACAGGATCACATATGGGGCCTGCCCCAGATATGTTAAGCA
AAACACTCTGAAATTGGCAACAGGGATGCGGAATGTACCAGAGAAACAAACTAGAGGCATATTCGGCGCA
ATCGCAGGTTTCATAGAAAATGGTTGGGAGGGAATGGTAGACGGTTGGTACGGTTTCAGGCATCAAAATT
CTGAGGGCACAGGACAAGCAGCAGATCTTAAAAGCACTCAAGCAGCAATCGACCAAATCAACGGGAAACT
GAATAGGTTAATCGAGAAAACGAACGAGAAATTCCATCAAATCGAAAAAGAATTCTCAGAAGTAGAAGGG
AGAATTCAGGACCTCGAGAAATATGTTGAAGACACTAAAATAGATCTCTGGTCTTACAACGCGGAGCTTC
TTGTTGCCCTGGAGAACCAACATACAATTGATCTTACTGACTCAGAAATGAACAAACTGTTTGAAAAAAC
AAGGAAGCAACTGAGGGAAAATGCTGAGGACATGGGCAATGGTTGCTTCAAAATATACCACAAATGTGAC
AATGCCTGCATAGGGTCAATCAGAAATGGAACTTATGACCATGATGTATACAGAGACGAAGCATTAAACA
ACCGGTTCCAGATCAAAGGTGTTGAGCTGAAGTCAGGATACAAAGATTGGATCCTGTGGATTTCCTTTGC
CATATCATGCTTTTTGCTTTGTGTTGTTTTGCTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATT
AGGTGTAACATTTGCATTTGA
33 changes: 33 additions & 0 deletions tests/pathogen-cluster-mutations.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
Run pathogen-embed with PCA on a H3N2 HA alignment.

$ pathogen-embed \
> --alignment $TESTDIR/data/h3n2_ha_alignment.sorted.fasta \
> --output-dataframe embed.csv \
> pca \
> --components 2

Find clusters from the embedding.

$ pathogen-cluster \
> --embedding embed.csv \
> --label-attribute cluster_label \
> --distance-threshold 0.5 \
> --output-dataframe cluster_embed.csv

Find mutations per cluster.

$ pathogen-cluster-mutations \
> --reference-sequence $TESTDIR/data/h3n2_ha_reference.fasta \
> --alignment $TESTDIR/data/h3n2_ha_alignment.sorted.fasta \
> --clusters cluster_embed.csv \
> --cluster-column cluster_label \
> --min-allele-count 10 \
> --min-allele-frequency 0.5 \
> --output mutations_cluster_embed.csv

Confirm that the mutation table output has the correct structure and more than one row.

$ head -n 1 mutations_cluster_embed.csv
mutation,cluster_count,distinct_clusters,cluster_column

$ [[ $(sed 1d mutations_cluster_embed.csv | wc -l | sed 's/ //g') > 0 ]]

0 comments on commit 0538bb6

Please sign in to comment.