Skip to content

Commit

Permalink
Merge pull request #320 from bacpop/sparse_dist_assign
Browse files Browse the repository at this point in the history
PopPUNK 2.7.0 candidate
  • Loading branch information
johnlees authored Aug 9, 2024
2 parents cdf1b2c + 173e65f commit 81d436d
Show file tree
Hide file tree
Showing 15 changed files with 186 additions and 147 deletions.
2 changes: 1 addition & 1 deletion PopPUNK/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

'''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''

__version__ = '2.6.7'
__version__ = '2.7.0'

# Minimum sketchlib version
SKETCHLIB_MAJOR = 2
Expand Down
182 changes: 97 additions & 85 deletions PopPUNK/assign.py

Large diffs are not rendered by default.

21 changes: 14 additions & 7 deletions PopPUNK/lineages.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import argparse
import subprocess
import pickle
import shutil
import pandas as pd
from collections import defaultdict

Expand Down Expand Up @@ -142,16 +143,18 @@ def main():
create_db(args)
elif args.query_db is not None:
query_db(args)


def create_db(args):

# Check if output files exist
if not args.overwrite:
if os.path.exists(args.output + '.csv'):
sys.stderr.write('Output file ' + args.output + '.csv exists; use --overwrite to replace it\n')
sys.exit(1)
if os.path.exists(args.db_scheme):
sys.stderr.write('Output file ' + args.db_scheme + ' exists; use --overwrite to replace it\n')
sys.exit(1)

sys.stderr.write("Identifying strains in existing database\n")
# Read in strain information
Expand Down Expand Up @@ -197,7 +200,8 @@ def create_db(args):
if num_isolates >= args.min_count:
lineage_dbs[strain] = strain_db_name
if os.path.isdir(strain_db_name) and args.overwrite:
os.rmdir(strain_db_name)
                    sys.stderr.write(f"--overwrite means {strain_db_name} will be deleted now\n")
shutil.rmtree(strain_db_name)
if not os.path.isdir(strain_db_name):
try:
os.makedirs(strain_db_name)
Expand All @@ -209,7 +213,8 @@ def create_db(args):
dest_db = os.path.join(strain_db_name,os.path.basename(strain_db_name) + '.h5')
rel_path = os.path.relpath(src_db, os.path.dirname(dest_db))
if os.path.exists(dest_db) and args.overwrite:
os.remove(dest_db)
                    sys.stderr.write(f"--overwrite means {dest_db} will be deleted now\n")
                    os.remove(dest_db)
elif not os.path.exists(dest_db):
os.symlink(rel_path,dest_db)
# Extract sparse distances
Expand Down Expand Up @@ -304,7 +309,7 @@ def create_db(args):


def query_db(args):

# Read querying scheme
with open(args.db_scheme, 'rb') as pickle_file:
ref_db, rlist, model_dir, clustering_file, args.clustering_col_name, distances, \
Expand Down Expand Up @@ -374,6 +379,7 @@ def query_db(args):
False, # write references - need to consider whether to support ref-only databases for assignment
distances,
False, # serial - needs to be supported for web version?
None, # stable - not supported here
args.threads,
True, # overwrite - probably OK?
False, # plot_fit - turn off for now
Expand Down Expand Up @@ -420,6 +426,7 @@ def query_db(args):
False, # write references - need to consider whether to support ref-only databases for assignment
lineage_distances,
False, # serial - needs to be supported for web version?
None, # stable - not supported here
args.threads,
True, # overwrite - probably OK?
False, # plot_fit - turn off for now
Expand All @@ -434,10 +441,10 @@ def query_db(args):
args.gpu_graph,
save_partial_query_graph = False)
overall_lineage[strain] = createOverallLineage(rank_list, lineageClustering)

# Print combined strain and lineage clustering
print_overall_clustering(overall_lineage,args.output + '.csv',qNames)


def print_overall_clustering(overall_lineage,output,include_list):

Expand All @@ -455,7 +462,7 @@ def print_overall_clustering(overall_lineage,output,include_list):
isolate_info[isolate].append(str(overall_lineage[strain][rank][isolate]))
else:
isolate_info[isolate] = [str(strain),str(overall_lineage[strain][rank][isolate])]

# Print output
with open(output,'w') as out:
out.write('id,Cluster,')
Expand Down
4 changes: 2 additions & 2 deletions PopPUNK/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -1425,7 +1425,7 @@ def printClusters(G, rlist, outPrefix=None, oldClusterFile=None,
if use_gpu:
component_assignments = cugraph.components.connectivity.connected_components(G)
component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False)
newClusters = [set() for rank in range(component_frequencies.size)]
newClusters = [set() for _ in range(component_frequencies.size)]
for isolate_index, isolate_name in enumerate(rlist): # assume sorted at the moment
component = component_assignments['labels'].iloc[isolate_index].item()
component_rank_bool = component_frequencies.index == component
Expand All @@ -1448,7 +1448,7 @@ def printClusters(G, rlist, outPrefix=None, oldClusterFile=None,
oldClusters = oldAllClusters[list(oldAllClusters.keys())[0]]
# parse all previously used clusters, including those that are merged
parsed_oldClusters = set([int(item) for sublist in (x.split('_') for x in oldClusters) for item in sublist])

new_id = max(parsed_oldClusters) + 1 # 1-indexed
while new_id in parsed_oldClusters:
new_id += 1 # in case clusters have been merged
Expand Down
7 changes: 5 additions & 2 deletions PopPUNK/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,12 +146,15 @@ def storePickle(rlist, qlist, self, X, pklName):
Whether an all-vs-all self DB (for :func:`~iterDistRows`)
X (numpy.array)
n x 2 array of core and accessory distances
If None, do not save
pklName (str)
Prefix for output files
"""
with open(pklName + ".pkl", 'wb') as pickle_file:
pickle.dump([rlist, qlist, self], pickle_file)
np.save(pklName + ".npy", X)
if isinstance(X, np.ndarray):
np.save(pklName + ".npy", X)


def readPickle(pklName, enforce_self=False, distances=True):
Expand Down Expand Up @@ -266,7 +269,7 @@ def readIsolateTypeFromCsv(clustCSV, mode = 'clusters', return_dict = False):
File name of CSV with isolate assignments
mode (str)
Type of file to read 'clusters', 'lineages', or 'external'
return_type (str)
return_dict (bool)
If True, return a dict with sample->cluster instead
of sets
[default = False]
Expand Down
17 changes: 14 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,24 @@ Lees JA, Harris SR, Tonkin-Hill G, Gladstone RA, Lo SW, Weiser JN, Corander J, B
Fast and flexible bacterial genomic epidemiology with PopPUNK. *Genome Research* **29**:304-316 (2019).
doi:[10.1101/gr.241455.118](https://doi.org/10.1101/gr.241455.118)

You can also run your command with `--citation` to get a [list of citations](https://poppunk.readthedocs.io/en/latest/citing.html) and a
suggested methods paragraph.
You can also run your command with `--citation` to get a [list of citations](https://poppunk.readthedocs.io/en/latest/citing.html) and a suggested methods paragraph.

## News and roadmap

The [roadmap](https://poppunk.bacpop.org/roadmap.html) can be found in the documentation.

### 2023-01-18
### 2024-08-07
PopPUNK 2.7.0 comes with two changes:
- Distance matrices `<db_name>.dists.npy` are no longer required or written when using
`poppunk_assign`, with or without `--update-db`. These can be very large, especially
with many samples, so this saves space and memory in model reuse and distribution. Note that
the `<db_name>.dists.pkl` file is still required (but this is small).
- We have added a `--stable` flag to `poppunk_assign`. Rather than merging hybrid clusters,
new samples will simply be assigned to their nearest neighbour. This implies `--serial` and
cannot be run with `--update-db`. This behaviour mimics the 'stable nomenclature' of schemes
such as [LIN](https://doi.org/10.1093/molbev/msac135).

### 2023-01-18
We have retired the PopPUNK website. Databases have been expanded, and can be
found here: https://www.bacpop.org/poppunk/.

Expand All @@ -45,11 +54,13 @@ change clusters).
If this is a common problem let us know, as we could write a script to 'upgrade'
HDBSCAN models.
See issue [#213](https://github.com/bacpop/PopPUNK/issues/213) for more details.

### 2021-03-15
We have fixed a number of bugs which may affect the use of `poppunk_assign` with
`--update-db`. We have also fixed a number of bugs with GPU distances. These are
'advanced' features and are not likely to be encountered in most cases, but if you do wish to use either of these features please make sure that you are using
`PopPUNK >=v2.4.0` with `pp-sketchlib >=v1.7.0`.

### 2020-09-30
We have discovered a bug affecting the interaction of pp-sketchlib and PopPUNK.
If you have used `PopPUNK >=v2.0.0` with `pp-sketchlib <v1.5.1` label order may
Expand Down
2 changes: 1 addition & 1 deletion docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ The advantages of PopPUNK are broadly that:
- It is fast, and scalable to over :math:`10^{5}` genomes in a single run.
- Assigning new query sequences to a cluster using an existing database is scalable even beyond this.
- Cluster names remain consistent between studies, and other cluster labels such as MLST
can be appended.
can be appended. **Please note that when used as documented PopPUNK outputs stable nomenclature**.
- Databases can be updated online (as sequences arrive).
- Online updating is equivalent to building databases from scratch.
- Databases can be kept small and manageable by only keeping representative isolates.
Expand Down
2 changes: 1 addition & 1 deletion docs/model_distribution.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ Database contents
A database requires the following files:

- ``.h5``. The sketch database, a HDF5 file.
- ``.dists.pkl`` and ``.dists.npy`` files. Distances for all vs all samples in the sketch database.
- ``.dists.pkl`` file. Order and names of samples in the sketch database.
- ``_fit.npz`` and ``_fit.pkl`` files. Python files which describe the model fit.
- ``_graph.gt``. The network relating distances, fit and strain assignment for all samples in the sketch database.
- ``_clusters.csv``. The strain assignment of all samples in the sketch database.
Expand Down
3 changes: 3 additions & 0 deletions docs/overview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ See :doc:`query_assignment` for instructions on how to use this mode.
You can think of this as being similar to using an existing MLST/cgMLST/wgMLST scheme
to define your sample's strains.

If you want to avoid any merged clusters (and get 'stable nomenclature') use the
``--stable`` flag.

Fit your own model
^^^^^^^^^^^^^^^^^^
If a database isn't available for your species, you can fit your own. This consists of three steps:
Expand Down
25 changes: 24 additions & 1 deletion docs/query_assignment.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,29 @@ Briefly, `download your reference database <https://www.bacpop.org/poppunk/>`__
poppunk_assign --db database --query qfile.txt \
--output poppunk_clusters --threads 8

Nomenclature
------------

PopPUNK clusters are numbered from one upwards, in decreasing order of size in the initial
dataset.

``poppunk_assign`` will assign your genomes into these existing clusters, with the same labels as the
initial run. So cluster labels, when used as documented, **do not change**.

In some cases, due to undersampling of the initial dataset or emergence
of hybrids, some clusters may be merged. These merged clusters will be named with
underscores separating the older clusters they were merged from. Use ``--external-clustering``
if you prefer other nicknames for these.

If you require 'stable nomenclature' where clusters never merge, use the ``--stable`` option
with ``poppunk_assign``. Each query will be assigned based on its nearest neighbour's cluster,
though novel clusters will still be separately identified as 'NA'.

Note that maintaining stable nomenclature in a dynamic population is not possible (for any
nomenclature). If you are maintaining a database and want to add new queries in, you will
need to use ``--update-db`` which may merge clusters. There is no way with two or more updates
of giving consistent new names to merged clusters.

Downloading a database
----------------------
Current PopPUNK databases can be found here: https://www.bacpop.org/poppunk/
Expand All @@ -18,7 +41,7 @@ as queries. The clusters assigned by PopPUNK are variable-length-k-mer clusters
A database called ``database`` will contain the following files, in ``database/``:

- ``database.h5`` -- the sketches of the reference sequences generated by ``pp-sketchlib``.
- ``database.dists.npy`` and ``database.dists.pkl`` -- the core and accessory distances for
- ``database.dists.pkl`` -- the order of the core and accessory distances for
all pairwise comparisons in the sketch database.
- ``database_fit.npz`` and ``database_fit.pkl`` -- the model fit to the core and accessory distances.
- ``database_graph.gt`` -- the network defining the fit (loadable with ``graph_tool``).
Expand Down
9 changes: 1 addition & 8 deletions docs/roadmap.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,9 @@ PopPUNK
-------
1. Containerise the workflow. See `#193 <https://github.com/bacpop/PopPUNK/issues/193>`__, `#277 <https://github.com/bacpop/PopPUNK/issues/277>`__, `#278 <https://github.com/bacpop/PopPUNK/issues/278>`__.
2. Add full worked tutorials back to the documentation `#275 <https://github.com/bacpop/PopPUNK/issues/275>`__.
3. Make the update pipeline more robust. See `#273 <https://github.com/bacpop/PopPUNK/issues/273>`__.
4. Codebase optimsation and refactoring
3. Codebase optimisation and refactoring
- Modularisation of the network code `#249 <https://github.com/bacpop/PopPUNK/issues/249>`__.
- Removing old functions `#103 <https://github.com/bacpop/PopPUNK/issues/103>`__
5. Add more species databases:
- N. meningitidis `#267 <https://github.com/bacpop/PopPUNK/issues/267>`__.
- H. influenzae `#276 <https://github.com/bacpop/PopPUNK/issues/276>`__.
6. Stable names for lineage/subclustering modes.

Other enhancements listed on the `issue page <https://github.com/bacpop/pp-sketchlib/issues>`__ are currently not planned.

Expand All @@ -27,8 +22,6 @@ pp-sketchlib

1. Update installation in package managers
- Update for new macOS `#92 <https://github.com/bacpop/ska.rust#planned-features>`__
- Rebuild conda recipe for CUDA12 and newer HDF5 `#46 <https://github.com/conda-forge/pp-sketchlib-feedstock/pull/46>`__
2. Allow amino-acids as input `#89 <https://github.com/bacpop/pp-sketchlib/issues/89>`__.

Other enhancements listed on the `issue page <https://github.com/bacpop/pp-sketchlib/issues>`__ are currently not planned.

Expand Down
14 changes: 12 additions & 2 deletions test/clean_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,14 @@ def deleteDir(dirname):
shutil.rmtree(dirname)

sys.stderr.write("Cleaning up tests\n")
dirty_files = ['example_db.info.csv']
dirty_files = [
"example_db.info.csv",
"example_external_clusters.csv",
"batch12_external_clusters.csv",
"example_lineage_scheme.pkl",
"lineage_creation_output.csv",
"lineage_querying_output.csv"
]
with open("references.txt", 'r') as ref_file:
for line in ref_file:
dirty_files.append(line.rstrip().split("\t")[1])
Expand All @@ -29,6 +36,7 @@ def deleteDir(dirname):
"example_query",
"example_single_query",
"example_query_update",
"example_query_update_2",
"example_lineage_query",
"example_viz",
"example_viz_subset",
Expand All @@ -46,8 +54,10 @@ def deleteDir(dirname):
"batch3",
"batch12",
"batch123",
"batch123_viz",
"strain_1_lineage_db",
"strain_2_lineage_db"
"strain_2_lineage_db",
"lineage_querying_output"
]
for outDir in outputDirs:
deleteDir(outDir)
Expand Down
5 changes: 5 additions & 0 deletions test/even_more_queries.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
12754_5_55 12754_5#55.contigs_velvet.fa
19183_4_61 19183_4#61.contigs_velvet.fa
12673_8_34 12673_8#34.contigs_velvet.fa
19183_4_70 19183_4#70.contigs_velvet.fa
12754_4_89 12754_4#89.contigs_velvet.fa
4 changes: 3 additions & 1 deletion test/run_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,14 @@
# assign query
sys.stderr.write("Running query assignment\n")
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_refine --output example_query --overwrite --core --accessory", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_refine --output example_query --serial --overwrite --core --accessory", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --serial --query some_queries.txt --db example_db --model-dir example_refine --output example_query --overwrite --core --accessory", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --stable core --query some_queries.txt --db example_db --model-dir example_refine --output example_query_stable --previous-clustering example_refine --overwrite", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_refine --output example_query --run-qc --length-range 2900000 3000000 --max-zero-dist 1 --overwrite", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_refine --output example_query --run-qc --max-pi-dist 0.04 --max-zero-dist 1 --betweenness --overwrite", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query more_queries.txt --db example_db --model-dir example_refine --output example_query --run-qc --max-zero-dist 0.3 --overwrite", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query more_queries.txt --db example_db --model-dir example_refine --output example_query --run-qc --max-zero-dist 1 --max-merge 3 --overwrite", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_dbscan --output example_query_update --update-db --graph-weights --overwrite", shell=True, check=True) # uses graph weights
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query even_more_queries.txt --db example_query_update --model-dir example_dbscan --previous-clustering example_query_update --output example_query_update_2 --update-db --graph-weights --overwrite", shell=True, check=True) # uses graph weights
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query single_query.txt --db example_db --model-dir example_refine --output example_single_query --update-db --overwrite", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query inref_query.txt --db example_db --model-dir example_refine --output example_single_query --write-references", shell=True, check=True) # matched name, but should be renamed in the output
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_refine --model-dir example_lineages --output example_lineage_query --overwrite", shell=True, check=True)
Expand Down
Loading

0 comments on commit 81d436d

Please sign in to comment.