Skip to content

Commit

Permalink
Merge pull request #320 from bacpop/sparse_dist_assign
Browse files Browse the repository at this point in the history
PopPUNK 2.7.0 candidate
  • Loading branch information
johnlees authored Aug 9, 2024
2 parents cdf1b2c + 173e65f commit 81d436d
Show file tree
Hide file tree
Showing 15 changed files with 186 additions and 147 deletions.
2 changes: 1 addition & 1 deletion PopPUNK/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

'''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''

__version__ = '2.6.7'
__version__ = '2.7.0'

# Minimum sketchlib version
SKETCHLIB_MAJOR = 2
Expand Down
182 changes: 97 additions & 85 deletions PopPUNK/assign.py

Large diffs are not rendered by default.

21 changes: 14 additions & 7 deletions PopPUNK/lineages.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import argparse
import subprocess
import pickle
import shutil
import pandas as pd
from collections import defaultdict

Expand Down Expand Up @@ -142,16 +143,18 @@ def main():
create_db(args)
elif args.query_db is not None:
query_db(args)


def create_db(args):

# Check if output files exist
if not args.overwrite:
if os.path.exists(args.output + '.csv'):
sys.stderr.write('Output file ' + args.output + '.csv exists; use --overwrite to replace it\n')
sys.exit(1)
if os.path.exists(args.db_scheme):
sys.stderr.write('Output file ' + args.db_scheme + ' exists; use --overwrite to replace it\n')
sys.exit(1)

sys.stderr.write("Identifying strains in existing database\n")
# Read in strain information
Expand Down Expand Up @@ -197,7 +200,8 @@ def create_db(args):
if num_isolates >= args.min_count:
lineage_dbs[strain] = strain_db_name
if os.path.isdir(strain_db_name) and args.overwrite:
os.rmdir(strain_db_name)
                    sys.stderr.write(f"--overwrite means {strain_db_name} will be deleted now\n")
shutil.rmtree(strain_db_name)
if not os.path.isdir(strain_db_name):
try:
os.makedirs(strain_db_name)
Expand All @@ -209,7 +213,8 @@ def create_db(args):
dest_db = os.path.join(strain_db_name,os.path.basename(strain_db_name) + '.h5')
rel_path = os.path.relpath(src_db, os.path.dirname(dest_db))
if os.path.exists(dest_db) and args.overwrite:
os.remove(dest_db)
                    sys.stderr.write(f"--overwrite means {dest_db} will be deleted now\n")
                    os.remove(dest_db)
elif not os.path.exists(dest_db):
os.symlink(rel_path,dest_db)
# Extract sparse distances
Expand Down Expand Up @@ -304,7 +309,7 @@ def create_db(args):


def query_db(args):

# Read querying scheme
with open(args.db_scheme, 'rb') as pickle_file:
ref_db, rlist, model_dir, clustering_file, args.clustering_col_name, distances, \
Expand Down Expand Up @@ -374,6 +379,7 @@ def query_db(args):
False, # write references - need to consider whether to support ref-only databases for assignment
distances,
False, # serial - needs to be supported for web version?
None, # stable - not supported here
args.threads,
True, # overwrite - probably OK?
False, # plot_fit - turn off for now
Expand Down Expand Up @@ -420,6 +426,7 @@ def query_db(args):
False, # write references - need to consider whether to support ref-only databases for assignment
lineage_distances,
False, # serial - needs to be supported for web version?
None, # stable - not supported here
args.threads,
True, # overwrite - probably OK?
False, # plot_fit - turn off for now
Expand All @@ -434,10 +441,10 @@ def query_db(args):
args.gpu_graph,
save_partial_query_graph = False)
overall_lineage[strain] = createOverallLineage(rank_list, lineageClustering)

# Print combined strain and lineage clustering
print_overall_clustering(overall_lineage,args.output + '.csv',qNames)


def print_overall_clustering(overall_lineage,output,include_list):

Expand All @@ -455,7 +462,7 @@ def print_overall_clustering(overall_lineage,output,include_list):
isolate_info[isolate].append(str(overall_lineage[strain][rank][isolate]))
else:
isolate_info[isolate] = [str(strain),str(overall_lineage[strain][rank][isolate])]

# Print output
with open(output,'w') as out:
out.write('id,Cluster,')
Expand Down
4 changes: 2 additions & 2 deletions PopPUNK/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -1425,7 +1425,7 @@ def printClusters(G, rlist, outPrefix=None, oldClusterFile=None,
if use_gpu:
component_assignments = cugraph.components.connectivity.connected_components(G)
component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False)
newClusters = [set() for rank in range(component_frequencies.size)]
newClusters = [set() for _ in range(component_frequencies.size)]
for isolate_index, isolate_name in enumerate(rlist): # assume sorted at the moment
component = component_assignments['labels'].iloc[isolate_index].item()
component_rank_bool = component_frequencies.index == component
Expand All @@ -1448,7 +1448,7 @@ def printClusters(G, rlist, outPrefix=None, oldClusterFile=None,
oldClusters = oldAllClusters[list(oldAllClusters.keys())[0]]
# parse all previously used clusters, including those that are merged
parsed_oldClusters = set([int(item) for sublist in (x.split('_') for x in oldClusters) for item in sublist])

new_id = max(parsed_oldClusters) + 1 # 1-indexed
while new_id in parsed_oldClusters:
new_id += 1 # in case clusters have been merged
Expand Down
7 changes: 5 additions & 2 deletions PopPUNK/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,12 +146,15 @@ def storePickle(rlist, qlist, self, X, pklName):
Whether an all-vs-all self DB (for :func:`~iterDistRows`)
X (numpy.array)
n x 2 array of core and accessory distances
If None, do not save
pklName (str)
Prefix for output files
"""
with open(pklName + ".pkl", 'wb') as pickle_file:
pickle.dump([rlist, qlist, self], pickle_file)
np.save(pklName + ".npy", X)
if isinstance(X, np.ndarray):
np.save(pklName + ".npy", X)


def readPickle(pklName, enforce_self=False, distances=True):
Expand Down Expand Up @@ -266,7 +269,7 @@ def readIsolateTypeFromCsv(clustCSV, mode = 'clusters', return_dict = False):
File name of CSV with isolate assignments
mode (str)
Type of file to read 'clusters', 'lineages', or 'external'
return_type (str)
return_dict (bool)
If True, return a dict with sample->cluster instead
of sets
[default = False]
Expand Down
17 changes: 14 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,24 @@ Lees JA, Harris SR, Tonkin-Hill G, Gladstone RA, Lo SW, Weiser JN, Corander J, B
Fast and flexible bacterial genomic epidemiology with PopPUNK. *Genome Research* **29**:304-316 (2019).
doi:[10.1101/gr.241455.118](https://doi.org/10.1101/gr.241455.118)

You can also run your command with `--citation` to get a [list of citations](https://poppunk.readthedocs.io/en/latest/citing.html) and a
suggested methods paragraph.
You can also run your command with `--citation` to get a [list of citations](https://poppunk.readthedocs.io/en/latest/citing.html) and a suggested methods paragraph.

## News and roadmap

The [roadmap](https://poppunk.bacpop.org/roadmap.html) can be found in the documentation.

### 2023-01-18
### 2024-08-07
PopPUNK 2.7.0 comes with two changes:
- Distance matrices `<db_name>.dists.npy` are no longer required or written when using
`poppunk_assign`, with or without `--update-db`. These can be very large, especially
with many samples, so this saves space and memory in model reuse and distribution. Note that
the `<db_name>.dists.pkl` file is still required (but this is small).
- We have added a `--stable` flag to `poppunk_assign`. Rather than merging hybrid clusters,
new samples will simply be assigned to their nearest neighbour. This implies `--serial` and
cannot be run with `--update-db`. This behaviour mimics the 'stable nomenclature' of schemes
such as [LIN](https://doi.org/10.1093/molbev/msac135).

### 2023-01-18
We have retired the PopPUNK website. Databases have been expanded, and can be
found here: https://www.bacpop.org/poppunk/.

Expand All @@ -45,11 +54,13 @@ change clusters).
If this is a common problem let us know, as we could write a script to 'upgrade'
HDBSCAN models.
See issue [#213](https://github.com/bacpop/PopPUNK/issues/213) for more details.

### 2021-03-15
We have fixed a number of bugs which may affect the use of `poppunk_assign` with
`--update-db`. We have also fixed a number of bugs with GPU distances. These are
'advanced' features and are not likely to be encountered in most cases, but if you do wish to use either of these features please make sure that you are using
`PopPUNK >=v2.4.0` with `pp-sketchlib >=v1.7.0`.

### 2020-09-30
We have discovered a bug affecting the interaction of pp-sketchlib and PopPUNK.
If you have used `PopPUNK >=v2.0.0` with `pp-sketchlib <v1.5.1` label order may
Expand Down
2 changes: 1 addition & 1 deletion docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ The advantages of PopPUNK are broadly that:
- It is fast, and scalable to over :math:`10^{5}` genomes in a single run.
- Assigning new query sequences to a cluster using an existing database is scalable even beyond this.
- Cluster names remain consistent between studies, and other cluster labels such as MLST
can be appended.
can be appended. **Please note that when used as documented PopPUNK outputs stable nomenclature**.
- Databases can be updated online (as sequences arrive).
- Online updating is equivalent to building databases from scratch.
- Databases can be kept small and manageable by only keeping representative isolates.
Expand Down
2 changes: 1 addition & 1 deletion docs/model_distribution.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ Database contents
A database requires the following files:

- ``.h5``. The sketch database, a HDF5 file.
- ``.dists.pkl`` and ``.dists.npy`` files. Distances for all vs all samples in the sketch database.
- ``.dists.pkl`` file. Order and names of samples in the sketch database.
- ``_fit.npz`` and ``_fit.pkl`` files. Python files which describe the model fit.
- ``_graph.gt``. The network relating distances, fit and strain assignment for all samples in the sketch database.
- ``_clusters.csv``. The strain assignment of all samples in the sketch database.
Expand Down
3 changes: 3 additions & 0 deletions docs/overview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ See :doc:`query_assignment` for instructions on how to use this mode.
You can think of this as being similar to using an existing MLST/cgMLST/wgMLST scheme
to define your sample's strains.

If you want to avoid any merged clusters (and get 'stable nomenclature') use the
``--stable`` flag.

Fit your own model
^^^^^^^^^^^^^^^^^^
If a database isn't available for your species, you can fit your own. This consists of three steps:
Expand Down
25 changes: 24 additions & 1 deletion docs/query_assignment.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,29 @@ Briefly, `download your reference database <https://www.bacpop.org/poppunk/>`__
poppunk_assign --db database --query qfile.txt \
--output poppunk_clusters --threads 8

Nomenclature
------------

PopPUNK clusters are numbered from one upwards, in decreasing order of size in the initial
dataset.

``poppunk_assign`` will assign your genomes into these existing clusters, with the same labels as the
initial run. So cluster labels, when used as documented, **do not change**.

In some cases, due to undersampling of the initial dataset or emergence
of hybrids, some clusters may be merged. These merged clusters will be named with
underscores separating the older clusters they were merged from. Use ``--external-clustering``
if you prefer other nicknames for these.

If you require 'stable nomenclature' where clusters never merge, use the ``--stable`` option
with ``poppunk_assign``. Each query will be assigned based on its nearest neighbour's cluster,
though novel clusters will still be separately identified as 'NA'.

Note that maintaining stable nomenclature in a dynamic population is not possible (for any
nomenclature). If you are maintaining a database and want to add new queries in, you will
need to use ``--update-db`` which may merge clusters. There is no way with two or more updates
of giving consistent new names to merged clusters.

Downloading a database
----------------------
Current PopPUNK databases can be found here: https://www.bacpop.org/poppunk/
Expand All @@ -18,7 +41,7 @@ as queries. The clusters assigned by PopPUNK are variable-length-k-mer clusters
A database called ``database`` will contain the following files, in ``database/``:

- ``database.h5`` -- the sketches of the reference sequences generated by ``pp-sketchlib``.
- ``database.dists.npy`` and ``database.dists.pkl`` -- the core and accessory distances for
- ``database.dists.pkl`` -- the order of the core and accessory distances for
all pairwise comparisons in the sketch database.
- ``database_fit.npz`` and ``database_fit.pkl`` -- the model fit to the core and accessory distances.
- ``database_graph.gt`` -- the network defining the fit (loadable with ``graph_tool``).
Expand Down
9 changes: 1 addition & 8 deletions docs/roadmap.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,9 @@ PopPUNK
-------
1. Containerise the workflow. See `#193 <https://github.com/bacpop/PopPUNK/issues/193>`__, `#277 <https://github.com/bacpop/PopPUNK/issues/277>`__, `#278 <https://github.com/bacpop/PopPUNK/issues/278>`__.
2. Add full worked tutorials back to the documentation `#275 <https://github.com/bacpop/PopPUNK/issues/275>`__.
3. Make the update pipeline more robust. See `#273 <https://github.com/bacpop/PopPUNK/issues/273>`__.
4. Codebase optimsation and refactoring
3. Codebase optimisation and refactoring
- Modularisation of the network code `#249 <https://github.com/bacpop/PopPUNK/issues/249>`__.
- Removing old functions `#103 <https://github.com/bacpop/PopPUNK/issues/103>`__
5. Add more species databases:
- N. meningitidis `#267 <https://github.com/bacpop/PopPUNK/issues/267>`__.
- H. influenzae `#276 <https://github.com/bacpop/PopPUNK/issues/276>`__.
6. Stable names for lineage/subclustering modes.

Other enhancements listed on the `issue page <https://github.com/bacpop/pp-sketchlib/issues>`__ are currently not planned.

Expand All @@ -27,8 +22,6 @@ pp-sketchlib

1. Update installation in package managers
- Update for new macOS `#92 <https://github.com/bacpop/ska.rust#planned-features>`__
- Rebuild conda recipe for CUDA12 and newer HDF5 `#46 <https://github.com/conda-forge/pp-sketchlib-feedstock/pull/46>`__
2. Allow amino-acids as input `#89 <https://github.com/bacpop/pp-sketchlib/issues/89>`__.

Other enhancements listed on the `issue page <https://github.com/bacpop/pp-sketchlib/issues>`__ are currently not planned.

Expand Down
14 changes: 12 additions & 2 deletions test/clean_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,14 @@ def deleteDir(dirname):
shutil.rmtree(dirname)

sys.stderr.write("Cleaning up tests\n")
dirty_files = ['example_db.info.csv']
dirty_files = [
"example_db.info.csv",
"example_external_clusters.csv",
"batch12_external_clusters.csv",
"example_lineage_scheme.pkl",
"lineage_creation_output.csv",
"lineage_querying_output.csv"
]
with open("references.txt", 'r') as ref_file:
for line in ref_file:
dirty_files.append(line.rstrip().split("\t")[1])
Expand All @@ -29,6 +36,7 @@ def deleteDir(dirname):
"example_query",
"example_single_query",
"example_query_update",
"example_query_update_2",
"example_lineage_query",
"example_viz",
"example_viz_subset",
Expand All @@ -46,8 +54,10 @@ def deleteDir(dirname):
"batch3",
"batch12",
"batch123",
"batch123_viz",
"strain_1_lineage_db",
"strain_2_lineage_db"
"strain_2_lineage_db",
"lineage_querying_output"
]
for outDir in outputDirs:
deleteDir(outDir)
Expand Down
5 changes: 5 additions & 0 deletions test/even_more_queries.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
12754_5_55 12754_5#55.contigs_velvet.fa
19183_4_61 19183_4#61.contigs_velvet.fa
12673_8_34 12673_8#34.contigs_velvet.fa
19183_4_70 19183_4#70.contigs_velvet.fa
12754_4_89 12754_4#89.contigs_velvet.fa
4 changes: 3 additions & 1 deletion test/run_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,14 @@
# assign query
sys.stderr.write("Running query assignment\n")
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_refine --output example_query --overwrite --core --accessory", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_refine --output example_query --serial --overwrite --core --accessory", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --serial --query some_queries.txt --db example_db --model-dir example_refine --output example_query --overwrite --core --accessory", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --stable core --query some_queries.txt --db example_db --model-dir example_refine --output example_query_stable --previous-clustering example_refine --overwrite", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_refine --output example_query --run-qc --length-range 2900000 3000000 --max-zero-dist 1 --overwrite", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_refine --output example_query --run-qc --max-pi-dist 0.04 --max-zero-dist 1 --betweenness --overwrite", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query more_queries.txt --db example_db --model-dir example_refine --output example_query --run-qc --max-zero-dist 0.3 --overwrite", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query more_queries.txt --db example_db --model-dir example_refine --output example_query --run-qc --max-zero-dist 1 --max-merge 3 --overwrite", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_dbscan --output example_query_update --update-db --graph-weights --overwrite", shell=True, check=True) # uses graph weights
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query even_more_queries.txt --db example_query_update --model-dir example_dbscan --previous-clustering example_query_update --output example_query_update_2 --update-db --graph-weights --overwrite", shell=True, check=True) # uses graph weights
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query single_query.txt --db example_db --model-dir example_refine --output example_single_query --update-db --overwrite", shell=True, check=True)
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query inref_query.txt --db example_db --model-dir example_refine --output example_single_query --write-references", shell=True, check=True) # matched name, but should be renamed in the output
subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_refine --model-dir example_lineages --output example_lineage_query --overwrite", shell=True, check=True)
Expand Down
Loading

0 comments on commit 81d436d

Please sign in to comment.