Skip to content

Commit

Permalink
Resolve conflicts with master
Browse files Browse the repository at this point in the history
  • Loading branch information
nickjcroucher committed Feb 29, 2024
2 parents 5652532 + 27e7f85 commit 025a818
Show file tree
Hide file tree
Showing 20 changed files with 478 additions and 146 deletions.
20 changes: 20 additions & 0 deletions .github/workflows/docs_pr.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Build the Sphinx documentation for every pull request and attach the
# rendered HTML as a downloadable artifact so reviewers can inspect it.
name: "Pull Request Docs Check"

on: [pull_request]

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v1

      # Runs `make html` (via sphinx-build) inside the given docs folder.
      - uses: ammaraskar/sphinx-action@master
        with:
          docs-folder: "docs/"

      # NOTE(review): checkout@v1 and upload-artifact@v1 are deprecated
      # action versions — consider upgrading, but kept as-is here.
      - uses: actions/upload-artifact@v1
        with:
          name: DocumentationHTML
          path: docs/_build/html/
42 changes: 42 additions & 0 deletions .github/workflows/docs_push.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Build the Sphinx documentation on every push to master and publish
# the result to GitHub Pages using the official deploy-pages action.
name: "Build and publish docs"

on:
  push:
    branches:
      - master

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v1

      # Runs sphinx-build against the docs/ folder.
      - uses: ammaraskar/sphinx-action@master
        with:
          docs-folder: "docs/"

      # The Pages artifact must be named "github-pages" for
      # actions/deploy-pages to find it in the deploy job below.
      - uses: actions/upload-pages-artifact@v2
        with:
          name: github-pages
          path: docs/_build/html/
          retention-days: 1

  deploy:
    # Minimum permissions required by actions/deploy-pages
    # (id-token: write enables OIDC-based deployment).
    permissions:
      contents: read
      pages: write
      id-token: write

    needs: build

    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}

    runs-on: ubuntu-latest
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v3
9 changes: 7 additions & 2 deletions .readthedocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
# Required
version: 2

# Set the version of Python and other tools you might need
build:
os: ubuntu-22.04
tools:
python: "3.11"

# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/conf.py
Expand All @@ -14,8 +20,7 @@ formats: all

# Optionally set the version of Python and requirements required to build your docs
python:
version: 3.8
install:
- requirements: docs/requirements.txt
- method: setuptools
path: docs
path: docs
2 changes: 1 addition & 1 deletion PopPUNK/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

'''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''

__version__ = '2.6.1'
__version__ = '2.6.4'

# Minimum sketchlib version
SKETCHLIB_MAJOR = 2
Expand Down
57 changes: 48 additions & 9 deletions PopPUNK/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,34 @@ def get_options():

# model fitting
modelGroup = parser.add_argument_group('Model fit options')
modelGroup.add_argument('--K', help='Maximum number of mixture components [default = 2]', type=int, default=2)
modelGroup.add_argument('--D', help='Maximum number of clusters in DBSCAN fitting [default = 100]', type=int, default=100)
modelGroup.add_argument('--min-cluster-prop', help='Minimum proportion of points in a cluster '
'in DBSCAN fitting [default = 0.0001]', type=float, default=0.0001)
modelGroup.add_argument('--threshold', help='Cutoff if using --fit-model threshold', type=float)
modelGroup.add_argument('--model-subsample',
help='Number of pairwise distances used to fit model [default = 100000]',
type=int,
default=100000)
modelGroup.add_argument('--assign-subsample',
help='Number of pairwise distances in each assignment batch [default = 5000]',
type=int,
default=5000)
modelGroup.add_argument('--for-refine',
help='Fit a BGMM or DBSCAN model without assigning all points to initialise a refined model',
default=False,
action='store_true')
modelGroup.add_argument('--K',
help='Maximum number of mixture components [default = 2]',
type=int,
default=2)
modelGroup.add_argument('--D',
help='Maximum number of clusters in DBSCAN fitting [default = 100]',
type=int,
default=100)
modelGroup.add_argument('--min-cluster-prop',
help='Minimum proportion of points in a cluster '
'in DBSCAN fitting [default = 0.0001]',
type=float,
default=0.0001)
modelGroup.add_argument('--threshold',
help='Cutoff if using --fit-model threshold',
type=float)

# model refinement
refinementGroup = parser.add_argument_group('Network analysis and model refinement options')
Expand Down Expand Up @@ -183,6 +206,7 @@ def get_options():
other.add_argument('--threads', default=1, type=int, help='Number of threads to use [default = 1]')
other.add_argument('--gpu-sketch', default=False, action='store_true', help='Use a GPU when calculating sketches (read data only) [default = False]')
other.add_argument('--gpu-dist', default=False, action='store_true', help='Use a GPU when calculating distances [default = False]')
other.add_argument('--gpu-model', default=False, action='store_true', help='Use a GPU when fitting a model [default = False]')
other.add_argument('--gpu-graph', default=False, action='store_true', help='Use a GPU when calculating networks [default = False]')
other.add_argument('--deviceid', default=0, type=int, help='CUDA device ID, if using GPU [default = 0]')
other.add_argument('--no-plot', help='Switch off model plotting, which can be slow for large datasets',
Expand Down Expand Up @@ -501,14 +525,24 @@ def main():
if args.fit_model:
# Run DBSCAN model
if args.fit_model == "dbscan":
model = DBSCANFit(output)
model = DBSCANFit(output,
max_samples = args.model_subsample,
max_batch_size = args.assign_subsample,
assign_points = not args.for_refine)
model.set_threads(args.threads)
assignments = model.fit(distMat, args.D, args.min_cluster_prop)
assignments = model.fit(distMat,
args.D,
args.min_cluster_prop,
args.gpu_model)
# Run Gaussian model
elif args.fit_model == "bgmm":
model = BGMMFit(output)
model = BGMMFit(output,
max_samples = args.model_subsample,
max_batch_size = args.assign_subsample,
assign_points = not args.for_refine)
model.set_threads(args.threads)
assignments = model.fit(distMat, args.K)
assignments = model.fit(distMat,
args.K)
elif args.fit_model == "refine":
new_model = RefineFit(output)
new_model.set_threads(args.threads)
Expand Down Expand Up @@ -568,6 +602,11 @@ def main():
else:
assignments = model.assign(distMat)

# end here if not assigning data
if args.for_refine:
sys.stderr.write('Initial model fit complete; points will be assigned when this model is refined\nusing "--fit-model refine"\n')
sys.exit(0)

#******************************#
#* *#
#* network construction *#
Expand Down
7 changes: 6 additions & 1 deletion PopPUNK/assign.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,11 @@ def assign_query_hdf5(dbFuncs,
raise RuntimeError("lineage models cannot be used with --serial")
model.set_threads(threads)

# Only proceed with a fully-fitted model
if not model.fitted or (hasattr(model,'assign_points') and model.assign_points == False):
sys.stderr.write('Cannot assign points with an incompletely-fitted model\nPlease refine this initial fit with "--fit-model refine"\n')
sys.exit(1)

# Set directories of previous fit
if previous_clustering is not None:
prev_clustering = previous_clustering
Expand Down Expand Up @@ -753,7 +758,7 @@ def assign_query_hdf5(dbFuncs,
postpruning_combined_seq, newDistMat = \
prune_distance_matrix(combined_seq, names_to_remove, complete_distMat,
output + "/" + os.path.basename(output) + dists_suffix)
graph_suffix = file_extension_string + '_refs_graph'
graph_suffix = file_extension_string + '.refs_graph'
save_network(genomeNetwork,
prefix = output,
suffix = graph_suffix,
Expand Down
44 changes: 31 additions & 13 deletions PopPUNK/dbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
# hdbscan
import hdbscan

def fitDbScan(X, min_samples, min_cluster_size, cache_out):
from .utils import check_and_set_gpu

def fitDbScan(X, min_samples, min_cluster_size, cache_out, use_gpu = False):
"""Function to fit DBSCAN model as an alternative to the Gaussian
Fits the DBSCAN model to the distances using hdbscan
Expand All @@ -23,26 +25,42 @@ def fitDbScan(X, min_samples, min_cluster_size, cache_out):
Minimum number of points in a cluster for HDBSCAN
cache_out (str)
Prefix for DBSCAN cache used for refitting
use_gpu (bool)
Whether GPU algorithms should be used in DBSCAN fitting
Returns:
hdb (hdbscan.HDBSCAN)
hdb (hdbscan.HDBSCAN or cuml.cluster.HDBSCAN)
Fitted HDBSCAN to subsampled data
labels (list)
Cluster assignments of each sample
n_clusters (int)
Number of clusters used
"""
# set DBSCAN clustering parameters
hdb = hdbscan.HDBSCAN(algorithm='boruvka_balltree',
min_samples = min_samples,
#core_dist_n_jobs = threads, # may cause error, see #19
memory = cache_out,
prediction_data = True,
min_cluster_size = min_cluster_size
).fit(X)
# Number of clusters in labels, ignoring noise if present.
labels = hdb.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
if use_gpu:
from cuml import cluster
import cupy as cp
sys.stderr.write('Fitting HDBSCAN model using a GPU\n')
hdb = cluster.hdbscan.HDBSCAN(min_samples = min_samples,
output_type = 'cupy',
prediction_data = True,
min_cluster_size = min_cluster_size
).fit(X)
# Number of clusters in labels, ignoring noise if present.
labels = hdb.labels_
n_clusters = len(cp.unique(labels[labels>-1]))
else:
sys.stderr.write('Fitting HDBSCAN model using a CPU\n')
hdb = hdbscan.HDBSCAN(algorithm='boruvka_balltree',
min_samples = min_samples,
#core_dist_n_jobs = threads, # may cause error, see #19
memory = cache_out,
prediction_data = True,
min_cluster_size = min_cluster_size
).fit(X)
# Number of clusters in labels, ignoring noise if present.
labels = hdb.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

# return model parameters
return hdb, labels, n_clusters
Expand Down Expand Up @@ -70,7 +88,7 @@ def evaluate_dbscan_clusters(model):

# evaluate whether maxima of cluster nearest origin do not
# overlap with minima of cluster furthest from origin
if core_minimum_of_between > core_maximum_of_within and \
if core_minimum_of_between > core_maximum_of_within or \
accessory_minimum_of_between > accessory_maximum_of_within:
indistinct = False

Expand Down
Loading

0 comments on commit 025a818

Please sign in to comment.