From ea9229bcf7dff506fe2843a67f27f646aa9da583 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Wed, 17 Jan 2024 14:26:20 +0000
Subject: [PATCH 01/62] Allow changes to model fitting subsample

---
 PopPUNK/__main__.py | 6 ++++--
 PopPUNK/models.py   | 4 +---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py
index 2fe144dc..ca64fbec 100644
--- a/PopPUNK/__main__.py
+++ b/PopPUNK/__main__.py
@@ -114,6 +114,8 @@ def get_options():
 
     # model fitting
     modelGroup = parser.add_argument_group('Model fit options')
+    modelGroup.add_argument('--model-subsample', help='Number of pairwise distances used to fit model [default = 100000]',
+                            type=int, default=100000)
     modelGroup.add_argument('--K', help='Maximum number of mixture components [default = 2]', type=int, default=2)
     modelGroup.add_argument('--D', help='Maximum number of clusters in DBSCAN fitting [default = 100]', type=int, default=100)
     modelGroup.add_argument('--min-cluster-prop', help='Minimum proportion of points in a cluster '
@@ -491,12 +493,12 @@ def main():
         if args.fit_model:
             # Run DBSCAN model
             if args.fit_model == "dbscan":
-                model = DBSCANFit(output)
+                model = DBSCANFit(output, max_samples = args.model_subsample)
                 model.set_threads(args.threads)
                 assignments = model.fit(distMat, args.D, args.min_cluster_prop)
             # Run Gaussian model
             elif args.fit_model == "bgmm":
-                model = BGMMFit(output)
+                model = BGMMFit(output, max_samples = args.model_subsample)
                 model.set_threads(args.threads)
                 assignments = model.fit(distMat, args.K)
             elif args.fit_model == "refine":
diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index d51f83f4..44262f1b 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -289,11 +289,10 @@ class BGMMFit(ClusterFit):
             The output prefix used for reading/writing
         max_samples (int)
             The number of subsamples to fit the model to
-
             (default = 100000)
     '''
 
-    def __init__(self, outPrefix, max_samples = 50000):
+    def __init__(self, outPrefix, max_samples = 100000):
         ClusterFit.__init__(self, outPrefix)
         self.type = 'bgmm'
         self.preprocess = True
@@ -469,7 +468,6 @@ class DBSCANFit(ClusterFit):
             The output prefix used for reading/writing
         max_samples (int)
             The number of subsamples to fit the model to
-
             (default = 100000)
     '''
 

From a4d98af7d13771fcf79c6ef7da7b58aca4778af1 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Wed, 17 Jan 2024 15:34:53 +0000
Subject: [PATCH 02/62] Add GPU HDBSCAN algorithm

---
 PopPUNK/__main__.py |  3 ++-
 PopPUNK/dbscan.py   | 40 ++++++++++++++++++++++++++++++++--------
 PopPUNK/models.py   | 10 ++++++++--
 3 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py
index ca64fbec..75b7884c 100644
--- a/PopPUNK/__main__.py
+++ b/PopPUNK/__main__.py
@@ -182,6 +182,7 @@ def get_options():
     other.add_argument('--threads', default=1, type=int, help='Number of threads to use [default = 1]')
     other.add_argument('--gpu-sketch', default=False, action='store_true', help='Use a GPU when calculating sketches (read data only) [default = False]')
     other.add_argument('--gpu-dist', default=False, action='store_true', help='Use a GPU when calculating distances [default = False]')
+    other.add_argument('--gpu-model', default=False, action='store_true', help='Use a GPU when fitting a model [default = False]')
     other.add_argument('--gpu-graph', default=False, action='store_true', help='Use a GPU when calculating networks [default = False]')
     other.add_argument('--deviceid', default=0, type=int, help='CUDA device ID, if using GPU [default = 0]')
     other.add_argument('--no-plot', help='Switch off model plotting, which can be slow for large datasets',
@@ -495,7 +496,7 @@ def main():
             if args.fit_model == "dbscan":
                 model = DBSCANFit(output, max_samples = args.model_subsample)
                 model.set_threads(args.threads)
-                assignments = model.fit(distMat, args.D, args.min_cluster_prop)
+                assignments = model.fit(distMat, args.D, args.min_cluster_prop, args.gpu_model)
             # Run Gaussian model
             elif args.fit_model == "bgmm":
                 model = BGMMFit(output, max_samples = args.model_subsample)
diff --git a/PopPUNK/dbscan.py b/PopPUNK/dbscan.py
index 2ab580c1..a85e19e4 100644
--- a/PopPUNK/dbscan.py
+++ b/PopPUNK/dbscan.py
@@ -9,7 +9,9 @@
 # hdbscan
 import hdbscan
 
-def fitDbScan(X, min_samples, min_cluster_size, cache_out):
+from .utils import check_and_set_gpu
+
+def fitDbScan(X, min_samples, min_cluster_size, cache_out, use_gpu = False):
     """Function to fit DBSCAN model as an alternative to the Gaussian
 
     Fits the DBSCAN model to the distances using hdbscan
@@ -23,6 +25,8 @@ def fitDbScan(X, min_samples, min_cluster_size, cache_out):
             Minimum number of points in a cluster for HDBSCAN
         cache_out (str)
             Prefix for DBSCAN cache used for refitting
+        use_gpu (bool)
+            Whether GPU algorithms should be used in DBSCAN fitting
 
     Returns:
         hdb (hdbscan.HDBSCAN)
@@ -32,14 +36,34 @@ def fitDbScan(X, min_samples, min_cluster_size, cache_out):
         n_clusters (int)
             Number of clusters used
     """
+    # Check on initialisation of GPU libraries and memory
+    try:
+        import cudf
+        from cuml import cluster
+        gpu_lib = True
+    except ImportError as e:
+        gpu_lib = False
+    # check on GPU
+    use_gpu = check_and_set_gpu(use_gpu,
+                                gpu_lib,
+                                quit_on_fail = True)
     # set DBSCAN clustering parameters
-    hdb = hdbscan.HDBSCAN(algorithm='boruvka_balltree',
-                     min_samples = min_samples,
-                     #core_dist_n_jobs = threads, # may cause error, see #19
-                     memory = cache_out,
-                     prediction_data = True,
-                     min_cluster_size = min_cluster_size
-                     ).fit(X)
+    if use_gpu:
+      sys.stderr.write('Fitting HDBSCAN model using a GPU')
+      hdb = cluster.HDBSCAN(min_samples = min_samples,
+                                 #core_dist_n_jobs = threads, # may cause error, see #19
+                                 prediction_data = True,
+                                 min_cluster_size = min_cluster_size
+                                 ).fit(X)
+    else:
+      sys.stderr.write('Fitting HDBSCAN model using a CPU')
+      hdb = hdbscan.HDBSCAN(algorithm='boruvka_balltree',
+                       min_samples = min_samples,
+                       #core_dist_n_jobs = threads, # may cause error, see #19
+                       memory = cache_out,
+                       prediction_data = True,
+                       min_cluster_size = min_cluster_size
+                       ).fit(X)
     # Number of clusters in labels, ignoring noise if present.
     labels = hdb.labels_
     n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 44262f1b..67114ea7 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -478,7 +478,7 @@ def __init__(self, outPrefix, max_samples = 100000):
         self.max_samples = max_samples
 
 
-    def fit(self, X, max_num_clusters, min_cluster_prop):
+    def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
         '''Extends :func:`~ClusterFit.fit`
 
         Fits the distances with HDBSCAN and returns assignments by calling
@@ -494,6 +494,8 @@ def fit(self, X, max_num_clusters, min_cluster_prop):
                 Maximum number of clusters in DBSCAN fitting
             min_cluster_prop (float)
                 Minimum proportion of points in a cluster in DBSCAN fitting
+            use_gpu (bool)
+                Whether GPU algorithms should be used in DBSCAN fitting
 
         Returns:
             y (numpy.array)
@@ -508,7 +510,11 @@ def fit(self, X, max_num_clusters, min_cluster_prop):
 
         indistinct_clustering = True
         while indistinct_clustering and min_cluster_size >= min_samples and min_samples >= 10:
-            self.hdb, self.labels, self.n_clusters = fitDbScan(self.subsampled_X, min_samples, min_cluster_size, cache_out)
+            self.hdb, self.labels, self.n_clusters = fitDbScan(self.subsampled_X,
+                                                                min_samples,
+                                                                min_cluster_size,
+                                                                cache_out,
+                                                                use_gpu = use_gpu)
             self.fitted = True # needed for predict
 
             # Test whether model fit contains distinct clusters

From d30297410b7a9c2d5121fd35a797933dfcd6f5ae Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Thu, 18 Jan 2024 13:42:46 +0000
Subject: [PATCH 03/62] Update GPU memory management

---
 PopPUNK/dbscan.py | 11 ++++++-----
 PopPUNK/utils.py  |  6 ++++--
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/PopPUNK/dbscan.py b/PopPUNK/dbscan.py
index a85e19e4..2f8b3d81 100644
--- a/PopPUNK/dbscan.py
+++ b/PopPUNK/dbscan.py
@@ -29,7 +29,7 @@ def fitDbScan(X, min_samples, min_cluster_size, cache_out, use_gpu = False):
             Whether GPU algorithms should be used in DBSCAN fitting
 
     Returns:
-        hdb (hdbscan.HDBSCAN)
+        hdb (hdbscan.HDBSCAN or cuml.cluster.HDBSCAN)
             Fitted HDBSCAN to subsampled data
         labels (list)
             Cluster assignments of each sample
@@ -40,6 +40,7 @@ def fitDbScan(X, min_samples, min_cluster_size, cache_out, use_gpu = False):
     try:
         import cudf
         from cuml import cluster
+        import cupy
         gpu_lib = True
     except ImportError as e:
         gpu_lib = False
@@ -49,14 +50,14 @@ def fitDbScan(X, min_samples, min_cluster_size, cache_out, use_gpu = False):
                                 quit_on_fail = True)
     # set DBSCAN clustering parameters
     if use_gpu:
-      sys.stderr.write('Fitting HDBSCAN model using a GPU')
-      hdb = cluster.HDBSCAN(min_samples = min_samples,
-                                 #core_dist_n_jobs = threads, # may cause error, see #19
+      sys.stderr.write('Fitting HDBSCAN model using a GPU\n')
+      hdb = cluster.hdbscan.HDBSCAN(min_samples = min_samples,
+                                 output_type = 'cupy',
                                  prediction_data = True,
                                  min_cluster_size = min_cluster_size
                                  ).fit(X)
     else:
-      sys.stderr.write('Fitting HDBSCAN model using a CPU')
+      sys.stderr.write('Fitting HDBSCAN model using a CPU\n')
       hdb = hdbscan.HDBSCAN(algorithm='boruvka_balltree',
                        min_samples = min_samples,
                        #core_dist_n_jobs = threads, # may cause error, see #19
diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py
index 647d0203..ebc1948e 100644
--- a/PopPUNK/utils.py
+++ b/PopPUNK/utils.py
@@ -8,6 +8,7 @@
 import sys
 # additional
 import pickle
+import multiprocessing
 from collections import defaultdict
 from itertools import chain
 from tempfile import mkstemp
@@ -572,11 +573,12 @@ def check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = False):
 
     # Set memory management for large networks
     if use_gpu:
+        multiprocessing.set_start_method('spawn', force=True)
         rmm.reinitialize(managed_memory=True)
         if "cupy" in sys.modules:
-            cupy.cuda.set_allocator(rmm.rmm_cupy_allocator)
+            cupy.cuda.set_allocator(rmm.allocators.cupy.rmm_cupy_allocator)
         if "cuda" in sys.modules:
-            cuda.set_memory_manager(rmm.RMMNumbaManager)
+            cuda.set_memory_manager(rmm.allocators.numba.RMMNumbaManager)
         assert(rmm.is_initialized())
 
     return use_gpu

From de8b76ff9906056d3a05116220d6c9b0eaa35334 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Thu, 18 Jan 2024 13:49:21 +0000
Subject: [PATCH 04/62] Update min samples limit

---
 PopPUNK/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 67114ea7..09eea472 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -505,7 +505,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
 
         # DBSCAN parameters
         cache_out = "./" + self.outPrefix + "_cache"
-        min_samples = max(int(min_cluster_prop * self.subsampled_X.shape[0]), 10)
+        min_samples = min(max(int(min_cluster_prop * self.subsampled_X.shape[0]), 10),1023)
         min_cluster_size = max(int(0.01 * self.subsampled_X.shape[0]), 10)
 
         indistinct_clustering = True

From edce43141b358544affa1f9a21a753ba38f6b2b8 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Thu, 18 Jan 2024 14:19:31 +0000
Subject: [PATCH 05/62] Enable GPU model assignments

---
 PopPUNK/models.py | 120 ++++++++++++++++++++++++++++------------------
 1 file changed, 73 insertions(+), 47 deletions(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 09eea472..5efab492 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -39,6 +39,7 @@
     import cupy as cp
     from numba import cuda
     import rmm
+    from cuml import cluster
 except ImportError:
     pass
 
@@ -519,22 +520,42 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
 
             # Test whether model fit contains distinct clusters
             if self.n_clusters > 1 and self.n_clusters <= max_num_clusters:
-                # get within strain cluster
-                self.max_cluster_num = self.labels.max()
-                self.cluster_means = np.full((self.n_clusters,2),0.0,dtype=float)
-                self.cluster_mins = np.full((self.n_clusters,2),0.0,dtype=float)
-                self.cluster_maxs = np.full((self.n_clusters,2),0.0,dtype=float)
-
-                for i in range(self.max_cluster_num+1):
-                    self.cluster_means[i,] = [np.mean(self.subsampled_X[self.labels==i,0]),np.mean(self.subsampled_X[self.labels==i,1])]
-                    self.cluster_mins[i,] = [np.min(self.subsampled_X[self.labels==i,0]),np.min(self.subsampled_X[self.labels==i,1])]
-                    self.cluster_maxs[i,] = [np.max(self.subsampled_X[self.labels==i,0]),np.max(self.subsampled_X[self.labels==i,1])]
-
-                y = self.assign(self.subsampled_X, no_scale=True, progress=False)
-                self.within_label = findWithinLabel(self.cluster_means, y)
-                self.between_label = findBetweenLabel(y, self.within_label)
-
-                indistinct_clustering = evaluate_dbscan_clusters(self)
+            
+              if use_gpu:
+                  # get within strain cluster
+                  self.max_cluster_num = self.labels.max()
+                  self.cluster_means = cupy.full((self.n_clusters,2),0.0,dtype=float)
+                  self.cluster_mins = cupy.full((self.n_clusters,2),0.0,dtype=float)
+                  self.cluster_maxs = cupy.full((self.n_clusters,2),0.0,dtype=float)
+
+                  for i in range(self.max_cluster_num+1):
+                      self.cluster_means[i,] = [cupy.mean(self.subsampled_X[self.labels==i,0]),cupy.mean(self.subsampled_X[self.labels==i,1])]
+                      self.cluster_mins[i,] = [cupy.min(self.subsampled_X[self.labels==i,0]),cupy.min(self.subsampled_X[self.labels==i,1])]
+                      self.cluster_maxs[i,] = [cupy.max(self.subsampled_X[self.labels==i,0]),cupy.max(self.subsampled_X[self.labels==i,1])]
+
+                  y = self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu = True)
+                  self.within_label = findWithinLabel(self.cluster_means, y)
+                  self.between_label = findBetweenLabel(y, self.within_label)
+
+                  indistinct_clustering = evaluate_dbscan_clusters(self)
+                  
+              else:
+                  # get within strain cluster
+                  self.max_cluster_num = self.labels.max()
+                  self.cluster_means = np.full((self.n_clusters,2),0.0,dtype=float)
+                  self.cluster_mins = np.full((self.n_clusters,2),0.0,dtype=float)
+                  self.cluster_maxs = np.full((self.n_clusters,2),0.0,dtype=float)
+
+                  for i in range(self.max_cluster_num+1):
+                      self.cluster_means[i,] = [np.mean(self.subsampled_X[self.labels==i,0]),np.mean(self.subsampled_X[self.labels==i,1])]
+                      self.cluster_mins[i,] = [np.min(self.subsampled_X[self.labels==i,0]),np.min(self.subsampled_X[self.labels==i,1])]
+                      self.cluster_maxs[i,] = [np.max(self.subsampled_X[self.labels==i,0]),np.max(self.subsampled_X[self.labels==i,1])]
+
+                  y = self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu = False)
+                  self.within_label = findWithinLabel(self.cluster_means, y)
+                  self.between_label = findBetweenLabel(y, self.within_label)
+
+                  indistinct_clustering = evaluate_dbscan_clusters(self)
 
             # Alter minimum cluster size criterion
             if min_cluster_size < min_samples / 2:
@@ -624,7 +645,7 @@ def plot(self, X=None, y=None):
                             self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_dbscan")
 
 
-    def assign(self, X, no_scale = False, progress = True):
+    def assign(self, X, no_scale = False, progress = True, use_gpu = False):
         '''Assign the clustering of new samples using :func:`~PopPUNK.dbscan.assign_samples_dbscan`
 
         Args:
@@ -632,12 +653,13 @@ def assign(self, X, no_scale = False, progress = True):
                 Core and accessory distances
             no_scale (bool)
                 Do not scale X
-
                 [default = False]
             progress (bool)
                 Show progress bar
-
                 [default = True]
+            use_gpu (bool)
+                Use GPU-enabled algorithms for clustering
+                [default = False]
         Returns:
             y (numpy.array)
                 Cluster assignments by samples
@@ -652,34 +674,38 @@ def assign(self, X, no_scale = False, progress = True):
             if progress:
                 sys.stderr.write("Assigning distances with DBSCAN model\n")
 
-            y = np.zeros(X.shape[0], dtype=int)
-            block_size = 5000
-            n_blocks = (X.shape[0] - 1) // block_size + 1
-            with SharedMemoryManager() as smm:
-                shm_X = smm.SharedMemory(size = X.nbytes)
-                X_shared_array = np.ndarray(X.shape, dtype = X.dtype, buffer = shm_X.buf)
-                X_shared_array[:] = X[:]
-                X_shared = NumpyShared(name = shm_X.name, shape = X.shape, dtype = X.dtype)
-
-                shm_y = smm.SharedMemory(size = y.nbytes)
-                y_shared_array = np.ndarray(y.shape, dtype = y.dtype, buffer = shm_y.buf)
-                y_shared_array[:] = y[:]
-                y_shared = NumpyShared(name = shm_y.name, shape = y.shape, dtype = y.dtype)
-
-                tqdm.set_lock(RLock())
-                process_map(partial(assign_samples,
-                            X = X_shared,
-                            y = y_shared,
-                            model = self,
-                            scale = scale,
-                            chunk_size = block_size,
-                            values = False),
-                    range(n_blocks),
-                    max_workers=self.threads,
-                    chunksize=min(10, max(1, n_blocks // self.threads)),
-                    disable=(progress == False))
-
-                y[:] = y_shared_array[:]
+            if use_gpu:
+              X_assignments = cluster.hdbscan.approximate_predict(self.hdb, X)
+              y = X_assignments.labels
+            else:
+              y = np.zeros(X.shape[0], dtype=int)
+              block_size = 5000
+              n_blocks = (X.shape[0] - 1) // block_size + 1
+              with SharedMemoryManager() as smm:
+                  shm_X = smm.SharedMemory(size = X.nbytes)
+                  X_shared_array = np.ndarray(X.shape, dtype = X.dtype, buffer = shm_X.buf)
+                  X_shared_array[:] = X[:]
+                  X_shared = NumpyShared(name = shm_X.name, shape = X.shape, dtype = X.dtype)
+
+                  shm_y = smm.SharedMemory(size = y.nbytes)
+                  y_shared_array = np.ndarray(y.shape, dtype = y.dtype, buffer = shm_y.buf)
+                  y_shared_array[:] = y[:]
+                  y_shared = NumpyShared(name = shm_y.name, shape = y.shape, dtype = y.dtype)
+
+                  tqdm.set_lock(RLock())
+                  process_map(partial(assign_samples,
+                              X = X_shared,
+                              y = y_shared,
+                              model = self,
+                              scale = scale,
+                              chunk_size = block_size,
+                              values = False),
+                      range(n_blocks),
+                      max_workers=self.threads,
+                      chunksize=min(10, max(1, n_blocks // self.threads)),
+                      disable=(progress == False))
+
+                  y[:] = y_shared_array[:]
 
         return y
 

From ab859c3664d64ccdf49f265edd96518bdee012d4 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Thu, 18 Jan 2024 14:29:24 +0000
Subject: [PATCH 06/62] Cluster counting with cupy

---
 PopPUNK/dbscan.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/PopPUNK/dbscan.py b/PopPUNK/dbscan.py
index 2f8b3d81..4c7adeed 100644
--- a/PopPUNK/dbscan.py
+++ b/PopPUNK/dbscan.py
@@ -40,7 +40,7 @@ def fitDbScan(X, min_samples, min_cluster_size, cache_out, use_gpu = False):
     try:
         import cudf
         from cuml import cluster
-        import cupy
+        import cupy as cp
         gpu_lib = True
     except ImportError as e:
         gpu_lib = False
@@ -56,6 +56,9 @@ def fitDbScan(X, min_samples, min_cluster_size, cache_out, use_gpu = False):
                                  prediction_data = True,
                                  min_cluster_size = min_cluster_size
                                  ).fit(X)
+      # Number of clusters in labels, ignoring noise if present.
+      labels = hdb.labels_
+      n_clusters = len(cp.unique(labels[labels>-1]))
     else:
       sys.stderr.write('Fitting HDBSCAN model using a CPU\n')
       hdb = hdbscan.HDBSCAN(algorithm='boruvka_balltree',
@@ -65,9 +68,9 @@ def fitDbScan(X, min_samples, min_cluster_size, cache_out, use_gpu = False):
                        prediction_data = True,
                        min_cluster_size = min_cluster_size
                        ).fit(X)
-    # Number of clusters in labels, ignoring noise if present.
-    labels = hdb.labels_
-    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
+      # Number of clusters in labels, ignoring noise if present.
+      labels = hdb.labels_
+      n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
 
     # return model parameters
     return hdb, labels, n_clusters

From d314fd8b01217f5f5f3634e8129cff4255efb951 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Thu, 18 Jan 2024 14:34:17 +0000
Subject: [PATCH 07/62] Fix cupy abbreviation

---
 PopPUNK/models.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 5efab492..6757b598 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -524,14 +524,14 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
               if use_gpu:
                   # get within strain cluster
                   self.max_cluster_num = self.labels.max()
-                  self.cluster_means = cupy.full((self.n_clusters,2),0.0,dtype=float)
-                  self.cluster_mins = cupy.full((self.n_clusters,2),0.0,dtype=float)
-                  self.cluster_maxs = cupy.full((self.n_clusters,2),0.0,dtype=float)
+                  self.cluster_means = cp.full((self.n_clusters,2),0.0,dtype=float)
+                  self.cluster_mins = cp.full((self.n_clusters,2),0.0,dtype=float)
+                  self.cluster_maxs = cp.full((self.n_clusters,2),0.0,dtype=float)
 
                   for i in range(self.max_cluster_num+1):
-                      self.cluster_means[i,] = [cupy.mean(self.subsampled_X[self.labels==i,0]),cupy.mean(self.subsampled_X[self.labels==i,1])]
-                      self.cluster_mins[i,] = [cupy.min(self.subsampled_X[self.labels==i,0]),cupy.min(self.subsampled_X[self.labels==i,1])]
-                      self.cluster_maxs[i,] = [cupy.max(self.subsampled_X[self.labels==i,0]),cupy.max(self.subsampled_X[self.labels==i,1])]
+                      self.cluster_means[i,] = [cp.mean(self.subsampled_X[self.labels==i,0]),cp.mean(self.subsampled_X[self.labels==i,1])]
+                      self.cluster_mins[i,] = [cp.min(self.subsampled_X[self.labels==i,0]),cp.min(self.subsampled_X[self.labels==i,1])]
+                      self.cluster_maxs[i,] = [cp.max(self.subsampled_X[self.labels==i,0]),cp.max(self.subsampled_X[self.labels==i,1])]
 
                   y = self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu = True)
                   self.within_label = findWithinLabel(self.cluster_means, y)

From 36856779963e6940fc40af01a83f884c6924161b Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Thu, 18 Jan 2024 16:13:39 +0000
Subject: [PATCH 08/62] Get max from cupy array

---
 PopPUNK/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 6757b598..98a9fbd5 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -523,7 +523,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
             
               if use_gpu:
                   # get within strain cluster
-                  self.max_cluster_num = self.labels.max()
+                  self.max_cluster_num = int(self.labels.amax()[0])
                   self.cluster_means = cp.full((self.n_clusters,2),0.0,dtype=float)
                   self.cluster_mins = cp.full((self.n_clusters,2),0.0,dtype=float)
                   self.cluster_maxs = cp.full((self.n_clusters,2),0.0,dtype=float)

From 6c031eb0c802875d9686e0b7f6b50752ff8a3e14 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Thu, 18 Jan 2024 16:18:05 +0000
Subject: [PATCH 09/62] Change max function

---
 PopPUNK/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 98a9fbd5..b7b91db1 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -523,7 +523,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
             
               if use_gpu:
                   # get within strain cluster
-                  self.max_cluster_num = int(self.labels.amax()[0])
+                  self.max_cluster_num = int(self.labels.max()[0])
                   self.cluster_means = cp.full((self.n_clusters,2),0.0,dtype=float)
                   self.cluster_mins = cp.full((self.n_clusters,2),0.0,dtype=float)
                   self.cluster_maxs = cp.full((self.n_clusters,2),0.0,dtype=float)

From 7b7d30b19b484ac6a34fca049c02868b23b93aa6 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Thu, 18 Jan 2024 16:22:52 +0000
Subject: [PATCH 10/62] Change cast to int

---
 PopPUNK/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index b7b91db1..4f0b2c52 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -523,7 +523,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
             
               if use_gpu:
                   # get within strain cluster
-                  self.max_cluster_num = int(self.labels.max()[0])
+                  self.max_cluster_num = int(self.labels.max())
                   self.cluster_means = cp.full((self.n_clusters,2),0.0,dtype=float)
                   self.cluster_mins = cp.full((self.n_clusters,2),0.0,dtype=float)
                   self.cluster_maxs = cp.full((self.n_clusters,2),0.0,dtype=float)

From aa8d68669df474027e780c0ae35ac2212e540b8e Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Thu, 18 Jan 2024 16:35:34 +0000
Subject: [PATCH 11/62] Convert data to cupy

---
 PopPUNK/models.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 4f0b2c52..ae86ff8a 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -509,8 +509,13 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
         min_samples = min(max(int(min_cluster_prop * self.subsampled_X.shape[0]), 10),1023)
         min_cluster_size = max(int(0.01 * self.subsampled_X.shape[0]), 10)
 
+        # Convert to cupy if using GPU to avoid implicit numpy conversion below
+        if use_gpu:
+            self.subsampled_X = cp.asarray(self.subsampled_X.shape)
+
         indistinct_clustering = True
         while indistinct_clustering and min_cluster_size >= min_samples and min_samples >= 10:
+            # Fit model
             self.hdb, self.labels, self.n_clusters = fitDbScan(self.subsampled_X,
                                                                 min_samples,
                                                                 min_cluster_size,

From 54b5ab07862651c590a8f9b4cab1d96fbb086a97 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Thu, 18 Jan 2024 20:33:00 +0000
Subject: [PATCH 12/62] Relax constraint on DBSCAN fits

---
 PopPUNK/dbscan.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PopPUNK/dbscan.py b/PopPUNK/dbscan.py
index 4c7adeed..5cd582e2 100644
--- a/PopPUNK/dbscan.py
+++ b/PopPUNK/dbscan.py
@@ -98,7 +98,7 @@ def evaluate_dbscan_clusters(model):
 
     # evaluate whether maxima of cluster nearest origin do not
     # overlap with minima of cluster furthest from origin
-    if core_minimum_of_between > core_maximum_of_within and \
+    if core_minimum_of_between > core_maximum_of_within or \
         accessory_minimum_of_between > accessory_maximum_of_within:
         indistinct = False
 

From 78336b7f5479d454691d702343b5e4aed0c12471 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Thu, 18 Jan 2024 20:33:10 +0000
Subject: [PATCH 13/62] Simplify fitting code

---
 PopPUNK/models.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index ae86ff8a..e7dadc83 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -539,10 +539,6 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
                       self.cluster_maxs[i,] = [cp.max(self.subsampled_X[self.labels==i,0]),cp.max(self.subsampled_X[self.labels==i,1])]
 
                   y = self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu = True)
-                  self.within_label = findWithinLabel(self.cluster_means, y)
-                  self.between_label = findBetweenLabel(y, self.within_label)
-
-                  indistinct_clustering = evaluate_dbscan_clusters(self)
                   
               else:
                   # get within strain cluster
@@ -557,10 +553,11 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
                       self.cluster_maxs[i,] = [np.max(self.subsampled_X[self.labels==i,0]),np.max(self.subsampled_X[self.labels==i,1])]
 
                   y = self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu = False)
-                  self.within_label = findWithinLabel(self.cluster_means, y)
-                  self.between_label = findBetweenLabel(y, self.within_label)
-
-                  indistinct_clustering = evaluate_dbscan_clusters(self)
+              
+              # Evaluate clustering
+              self.within_label = findWithinLabel(self.cluster_means, y)
+              self.between_label = findBetweenLabel(y, self.within_label)
+              indistinct_clustering = evaluate_dbscan_clusters(self)
 
             # Alter minimum cluster size criterion
             if min_cluster_size < min_samples / 2:

From 26e951103e617f4031e6e08ddbfda9482e789634 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 19 Jan 2024 10:26:43 +0000
Subject: [PATCH 14/62] Manage GPU memory

---
 PopPUNK/dbscan.py | 14 ++------------
 PopPUNK/models.py | 22 ++++++++++++++++++----
 2 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/PopPUNK/dbscan.py b/PopPUNK/dbscan.py
index 5cd582e2..4b073247 100644
--- a/PopPUNK/dbscan.py
+++ b/PopPUNK/dbscan.py
@@ -36,20 +36,10 @@ def fitDbScan(X, min_samples, min_cluster_size, cache_out, use_gpu = False):
         n_clusters (int)
             Number of clusters used
     """
-    # Check on initialisation of GPU libraries and memory
-    try:
-        import cudf
-        from cuml import cluster
-        import cupy as cp
-        gpu_lib = True
-    except ImportError as e:
-        gpu_lib = False
-    # check on GPU
-    use_gpu = check_and_set_gpu(use_gpu,
-                                gpu_lib,
-                                quit_on_fail = True)
     # set DBSCAN clustering parameters
     if use_gpu:
+      from cuml import cluster
+      import cupy as cp
       sys.stderr.write('Fitting HDBSCAN model using a GPU\n')
       hdb = cluster.hdbscan.HDBSCAN(min_samples = min_samples,
                                  output_type = 'cupy',
diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index e7dadc83..c5a5edb3 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -509,9 +509,22 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
         min_samples = min(max(int(min_cluster_prop * self.subsampled_X.shape[0]), 10),1023)
         min_cluster_size = max(int(0.01 * self.subsampled_X.shape[0]), 10)
 
+        # Check on initialisation of GPU libraries and memory
         # Convert to cupy if using GPU to avoid implicit numpy conversion below
         if use_gpu:
-            self.subsampled_X = cp.asarray(self.subsampled_X.shape)
+            try:
+                import cudf
+                from cuml import cluster
+                import cupy as cp
+                gpu_lib = True
+            except ImportError as e:
+                gpu_lib = False
+            # check on GPU
+            use_gpu = check_and_set_gpu(use_gpu,
+                                gpu_lib,
+                                quit_on_fail = True)
+            if use_gpu:
+                self.subsampled_X = cp.asarray(self.subsampled_X)
 
         indistinct_clustering = True
         while indistinct_clustering and min_cluster_size >= min_samples and min_samples >= 10:
@@ -534,9 +547,10 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
                   self.cluster_maxs = cp.full((self.n_clusters,2),0.0,dtype=float)
 
                   for i in range(self.max_cluster_num+1):
-                      self.cluster_means[i,] = [cp.mean(self.subsampled_X[self.labels==i,0]),cp.mean(self.subsampled_X[self.labels==i,1])]
-                      self.cluster_mins[i,] = [cp.min(self.subsampled_X[self.labels==i,0]),cp.min(self.subsampled_X[self.labels==i,1])]
-                      self.cluster_maxs[i,] = [cp.max(self.subsampled_X[self.labels==i,0]),cp.max(self.subsampled_X[self.labels==i,1])]
+                      labelled_rows = cp.where(self.labels==i,True,False)
+                      self.cluster_means[i,] = [cp.mean(self.subsampled_X[labelled_rows,0]),cp.mean(self.subsampled_X[labelled_rows,1])]
+                      self.cluster_mins[i,] = [cp.min(self.subsampled_X[labelled_rows,0]),cp.min(self.subsampled_X[labelled_rows,1])]
+                      self.cluster_maxs[i,] = [cp.max(self.subsampled_X[labelled_rows,0]),cp.max(self.subsampled_X[labelled_rows,1])]
 
                   y = self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu = True)
                   

From c5ab3d8ecafdb3002c945a60e9366298d81c8dc0 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 19 Jan 2024 10:39:30 +0000
Subject: [PATCH 15/62] Cupy indexing fix

---
 PopPUNK/models.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index c5a5edb3..009a0e5d 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -548,9 +548,9 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
 
                   for i in range(self.max_cluster_num+1):
                       labelled_rows = cp.where(self.labels==i,True,False)
-                      self.cluster_means[i,] = [cp.mean(self.subsampled_X[labelled_rows,0]),cp.mean(self.subsampled_X[labelled_rows,1])]
-                      self.cluster_mins[i,] = [cp.min(self.subsampled_X[labelled_rows,0]),cp.min(self.subsampled_X[labelled_rows,1])]
-                      self.cluster_maxs[i,] = [cp.max(self.subsampled_X[labelled_rows,0]),cp.max(self.subsampled_X[labelled_rows,1])]
+                      self.cluster_means[i,] = [cp.mean(self.subsampled_X[labelled_rows,cp.array([0])]),cp.mean(self.subsampled_X[labelled_rows,cp.array([1])])]
+                      self.cluster_mins[i,] = [cp.min(self.subsampled_X[labelled_rows,cp.array([0])]),cp.min(self.subsampled_X[labelled_rows,cp.array([1])])]
+                      self.cluster_maxs[i,] = [cp.max(self.subsampled_X[labelled_rows,cp.array([0])]),cp.max(self.subsampled_X[labelled_rows,cp.array([1])])]
 
                   y = self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu = True)
                   

From 50ac74515eff005163d6d73c01d04dbb27598f1f Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 19 Jan 2024 10:44:37 +0000
Subject: [PATCH 16/62] Cupy indexing fix again

---
 PopPUNK/models.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 009a0e5d..886c1e9c 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -548,9 +548,9 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
 
                   for i in range(self.max_cluster_num+1):
                       labelled_rows = cp.where(self.labels==i,True,False)
-                      self.cluster_means[i,] = [cp.mean(self.subsampled_X[labelled_rows,cp.array([0])]),cp.mean(self.subsampled_X[labelled_rows,cp.array([1])])]
-                      self.cluster_mins[i,] = [cp.min(self.subsampled_X[labelled_rows,cp.array([0])]),cp.min(self.subsampled_X[labelled_rows,cp.array([1])])]
-                      self.cluster_maxs[i,] = [cp.max(self.subsampled_X[labelled_rows,cp.array([0])]),cp.max(self.subsampled_X[labelled_rows,cp.array([1])])]
+                      self.cluster_means[cp.array(i),] = [cp.mean(self.subsampled_X[labelled_rows,cp.array([0])]),cp.mean(self.subsampled_X[labelled_rows,cp.array([1])])]
+                      self.cluster_mins[cp.array(i),] = [cp.min(self.subsampled_X[labelled_rows,cp.array([0])]),cp.min(self.subsampled_X[labelled_rows,cp.array([1])])]
+                      self.cluster_maxs[cp.array(i),] = [cp.max(self.subsampled_X[labelled_rows,cp.array([0])]),cp.max(self.subsampled_X[labelled_rows,cp.array([1])])]
 
                   y = self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu = True)
                   

From 507cfa3f35310e84819ae71cf7edb7af95c6f74a Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 19 Jan 2024 10:55:53 +0000
Subject: [PATCH 17/62] Fix prediction parsing

---
 PopPUNK/models.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 886c1e9c..50eedd95 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -691,8 +691,7 @@ def assign(self, X, no_scale = False, progress = True, use_gpu = False):
                 sys.stderr.write("Assigning distances with DBSCAN model\n")
 
             if use_gpu:
-              X_assignments = cluster.hdbscan.approximate_predict(self.hdb, X)
-              y = X_assignments.labels
+              y, y_probabilities = cluster.hdbscan.approximate_predict(self.hdb, X)
             else:
               y = np.zeros(X.shape[0], dtype=int)
               block_size = 5000

From da70f27814fbcd79a6094c0d38dc51349aefcf0d Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 19 Jan 2024 11:01:07 +0000
Subject: [PATCH 18/62] Ignore CPU only cache

---
 PopPUNK/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 50eedd95..d4ffad66 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -583,7 +583,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
             self.fitted = False
             sys.stderr.write("Failed to find distinct clusters in this dataset\n")
             sys.exit(1)
-        else:
+        else if not use_gpu:
             shutil.rmtree(cache_out)
 
         y = self.assign(X)

From 5651d2f7b8ed60ba556247b50a850351f6fdddbc Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 19 Jan 2024 11:01:50 +0000
Subject: [PATCH 19/62] Use python

---
 PopPUNK/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index d4ffad66..707f12d7 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -583,7 +583,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
             self.fitted = False
             sys.stderr.write("Failed to find distinct clusters in this dataset\n")
             sys.exit(1)
-        else if not use_gpu:
+        elif not use_gpu:
             shutil.rmtree(cache_out)
 
         y = self.assign(X)

From b5abfdc26e91d04bc4c15cd1945276c8b2814e05 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 19 Jan 2024 11:17:58 +0000
Subject: [PATCH 20/62] Assign by GPU in blocks

---
 PopPUNK/models.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 707f12d7..d8f9655f 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -691,7 +691,15 @@ def assign(self, X, no_scale = False, progress = True, use_gpu = False):
                 sys.stderr.write("Assigning distances with DBSCAN model\n")
 
             if use_gpu:
-              y, y_probabilities = cluster.hdbscan.approximate_predict(self.hdb, X)
+              y = np.zeros(X.shape[0], dtype=int)
+              block_size = 500000
+              n_blocks = (X.shape[0] - 1) // block_size + 1
+              for block in range(n_blocks):
+                  start_index = block*block_size
+                  end_index = max((block+1)*block_size,X.shape[0])
+                  sys.stderr.write("Processing rows " + str(start_index) + " to " + str(end_index) + "\n")
+                  y_assignments, y_probabilities = cluster.hdbscan.approximate_predict(self.hdb, X[start_index:end_index,])
+                  y[start_index:end_index] = cp.asnumpy(y_assignments)
             else:
               y = np.zeros(X.shape[0], dtype=int)
               block_size = 5000

From aafaa1cbeb7d8bac6fefb6ce1b9d2f9786cf02df Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 19 Jan 2024 11:24:48 +0000
Subject: [PATCH 21/62] Specify GPU assignment

---
 PopPUNK/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index d8f9655f..6e47218c 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -586,7 +586,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
         elif not use_gpu:
             shutil.rmtree(cache_out)
 
-        y = self.assign(X)
+        y = self.assign(X, use_gpu = use_gpu)
         return y
 
 

From e6cbf901c7fa8c8e82fec2e3e9c89d79521e5504 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 19 Jan 2024 11:30:52 +0000
Subject: [PATCH 22/62] Improve indices

---
 PopPUNK/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 6e47218c..aa05b325 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -696,7 +696,7 @@ def assign(self, X, no_scale = False, progress = True, use_gpu = False):
               n_blocks = (X.shape[0] - 1) // block_size + 1
               for block in range(n_blocks):
                   start_index = block*block_size
-                  end_index = max((block+1)*block_size,X.shape[0])
+                  end_index = min((block+1)*block_size-1,X.shape[0])
                   sys.stderr.write("Processing rows " + str(start_index) + " to " + str(end_index) + "\n")
                   y_assignments, y_probabilities = cluster.hdbscan.approximate_predict(self.hdb, X[start_index:end_index,])
                   y[start_index:end_index] = cp.asnumpy(y_assignments)

From 539d939bed1aa3d3cd72d38499bd9b54f1c835ad Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 19 Jan 2024 11:37:55 +0000
Subject: [PATCH 23/62] Allow flexible assignment batch size

---
 PopPUNK/models.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index aa05b325..5849eb4e 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -586,7 +586,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
         elif not use_gpu:
             shutil.rmtree(cache_out)
 
-        y = self.assign(X, use_gpu = use_gpu)
+        y = self.assign(X, max_batch_size = max_match_size = self.subsampled_X.shape[0], use_gpu = use_gpu)
         return y
 
 
@@ -661,7 +661,7 @@ def plot(self, X=None, y=None):
                             self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_dbscan")
 
 
-    def assign(self, X, no_scale = False, progress = True, use_gpu = False):
+    def assign(self, X, no_scale = False, progress = True, max_batch_size = 5000, use_gpu = False):
         '''Assign the clustering of new samples using :func:`~PopPUNK.dbscan.assign_samples_dbscan`
 
         Args:
@@ -673,6 +673,9 @@ def assign(self, X, no_scale = False, progress = True, use_gpu = False):
             progress (bool)
                 Show progress bar
                 [default = True]
+            max_batch_size (int)
+                Batch size used for GPU assignments
+                [default = 5000]
             use_gpu (bool)
                 Use GPU-enabled algorithms for clustering
                 [default = False]
@@ -692,7 +695,7 @@ def assign(self, X, no_scale = False, progress = True, use_gpu = False):
 
             if use_gpu:
               y = np.zeros(X.shape[0], dtype=int)
-              block_size = 500000
+              block_size = max_batch_size
               n_blocks = (X.shape[0] - 1) // block_size + 1
               for block in range(n_blocks):
                   start_index = block*block_size

From 176b3230f58731cde3ed31984d06abc71dfec28a Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 19 Jan 2024 11:39:17 +0000
Subject: [PATCH 24/62] Argument typo

---
 PopPUNK/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 5849eb4e..b68e6679 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -586,7 +586,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
         elif not use_gpu:
             shutil.rmtree(cache_out)
 
-        y = self.assign(X, max_batch_size = max_match_size = self.subsampled_X.shape[0], use_gpu = use_gpu)
+        y = self.assign(X, max_match_size = self.subsampled_X.shape[0], use_gpu = use_gpu)
         return y
 
 

From 8ebbb8936913ae2235690df665b13c95aee0a7a4 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 19 Jan 2024 12:06:51 +0000
Subject: [PATCH 25/62] Fix assignment command

---
 PopPUNK/models.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index b68e6679..498b8111 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -552,7 +552,11 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
                       self.cluster_mins[cp.array(i),] = [cp.min(self.subsampled_X[labelled_rows,cp.array([0])]),cp.min(self.subsampled_X[labelled_rows,cp.array([1])])]
                       self.cluster_maxs[cp.array(i),] = [cp.max(self.subsampled_X[labelled_rows,cp.array([0])]),cp.max(self.subsampled_X[labelled_rows,cp.array([1])])]
 
-                  y = self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu = True)
+                  y = self.assign(self.subsampled_X,
+                                  no_scale=True,
+                                  progress=False,
+                                  max_batch_size = self.subsampled_X.shape[0],
+                                  use_gpu = True)
                   
               else:
                   # get within strain cluster
@@ -586,7 +590,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
         elif not use_gpu:
             shutil.rmtree(cache_out)
 
-        y = self.assign(X, max_match_size = self.subsampled_X.shape[0], use_gpu = use_gpu)
+        y = self.assign(X, max_batch_size = self.subsampled_X.shape[0], use_gpu = use_gpu)
         return y
 
 

From 8511ae8241cc9cb2bfe62cafaff0856a77131287 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Sun, 21 Jan 2024 13:08:24 +0000
Subject: [PATCH 26/62] Separate fitting and assigning batch size

---
 PopPUNK/__main__.py | 4 +++-
 PopPUNK/models.py   | 9 +++++----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py
index 75b7884c..f53d6635 100644
--- a/PopPUNK/__main__.py
+++ b/PopPUNK/__main__.py
@@ -116,6 +116,8 @@ def get_options():
     modelGroup = parser.add_argument_group('Model fit options')
     modelGroup.add_argument('--model-subsample', help='Number of pairwise distances used to fit model [default = 100000]',
                             type=int, default=100000)
+    modelGroup.add_argument('--assign-subsample', help='Number of pairwise distances in each assignment batch [default = 5000]',
+                            type=int, default=5000)
     modelGroup.add_argument('--K', help='Maximum number of mixture components [default = 2]', type=int, default=2)
     modelGroup.add_argument('--D', help='Maximum number of clusters in DBSCAN fitting [default = 100]', type=int, default=100)
     modelGroup.add_argument('--min-cluster-prop', help='Minimum proportion of points in a cluster '
@@ -494,7 +496,7 @@ def main():
         if args.fit_model:
             # Run DBSCAN model
             if args.fit_model == "dbscan":
-                model = DBSCANFit(output, max_samples = args.model_subsample)
+                model = DBSCANFit(output, max_samples = args.model_subsample, max_batch_size = args.assign_subsample)
                 model.set_threads(args.threads)
                 assignments = model.fit(distMat, args.D, args.min_cluster_prop, args.gpu_model)
             # Run Gaussian model
diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 498b8111..1a0a3ca8 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -678,7 +678,7 @@ def assign(self, X, no_scale = False, progress = True, max_batch_size = 5000, us
                 Show progress bar
                 [default = True]
             max_batch_size (int)
-                Batch size used for GPU assignments
+                Batch size used for assignments
                 [default = 5000]
             use_gpu (bool)
                 Use GPU-enabled algorithms for clustering
@@ -696,10 +696,12 @@ def assign(self, X, no_scale = False, progress = True, max_batch_size = 5000, us
                 scale = self.scale
             if progress:
                 sys.stderr.write("Assigning distances with DBSCAN model\n")
-
+            
+            # Set block size
+            block_size = max_batch_size
+            
             if use_gpu:
               y = np.zeros(X.shape[0], dtype=int)
-              block_size = max_batch_size
               n_blocks = (X.shape[0] - 1) // block_size + 1
               for block in range(n_blocks):
                   start_index = block*block_size
@@ -709,7 +711,6 @@ def assign(self, X, no_scale = False, progress = True, max_batch_size = 5000, us
                   y[start_index:end_index] = cp.asnumpy(y_assignments)
             else:
               y = np.zeros(X.shape[0], dtype=int)
-              block_size = 5000
               n_blocks = (X.shape[0] - 1) // block_size + 1
               with SharedMemoryManager() as smm:
                   shm_X = smm.SharedMemory(size = X.nbytes)

From 03aef56019d150738bc4561ad3cfc9e9ace380e5 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Sun, 21 Jan 2024 13:18:55 +0000
Subject: [PATCH 27/62] Parse batch size

---
 PopPUNK/models.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 1a0a3ca8..64826282 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -472,10 +472,11 @@ class DBSCANFit(ClusterFit):
             (default = 100000)
     '''
 
-    def __init__(self, outPrefix, max_samples = 100000):
+    def __init__(self, outPrefix, max_batch_size = 5000, max_samples = 100000):
         ClusterFit.__init__(self, outPrefix)
         self.type = 'dbscan'
         self.preprocess = True
+        self.max_batch_size = max_batch_size
         self.max_samples = max_samples
 
 
@@ -590,7 +591,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
         elif not use_gpu:
             shutil.rmtree(cache_out)
 
-        y = self.assign(X, max_batch_size = self.subsampled_X.shape[0], use_gpu = use_gpu)
+        y = self.assign(X, max_batch_size = self.max_batch_size, use_gpu = use_gpu)
         return y
 
 

From fadc54da30869292ff6327caf71c49de018ce94b Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Sun, 21 Jan 2024 17:30:13 +0000
Subject: [PATCH 28/62] Use cupy in plot function

---
 PopPUNK/models.py | 7 ++++---
 PopPUNK/plot.py   | 9 +++++++--
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 64826282..9c8fde61 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -472,13 +472,13 @@ class DBSCANFit(ClusterFit):
             (default = 100000)
     '''
 
-    def __init__(self, outPrefix, max_batch_size = 5000, max_samples = 100000):
+    def __init__(self, outPrefix, use_gpu = False, max_batch_size = 5000, max_samples = 100000):
         ClusterFit.__init__(self, outPrefix)
         self.type = 'dbscan'
         self.preprocess = True
         self.max_batch_size = max_batch_size
         self.max_samples = max_samples
-
+        self.use_gpu = use_gpu
 
     def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
         '''Extends :func:`~ClusterFit.fit`
@@ -663,7 +663,8 @@ def plot(self, X=None, y=None):
         plot_dbscan_results(self.subsampled_X * self.scale,
                             self.assign(self.subsampled_X, no_scale=True, progress=False),
                             self.n_clusters,
-                            self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_dbscan")
+                            self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_dbscan",
+                            self.use_gpu)
 
 
     def assign(self, X, no_scale = False, progress = True, max_batch_size = 5000, use_gpu = False):
diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py
index faf5ac37..22e639b3 100644
--- a/PopPUNK/plot.py
+++ b/PopPUNK/plot.py
@@ -182,7 +182,7 @@ def plot_results(X, Y, means, covariances, scale, title, out_prefix):
     plt.savefig(out_prefix + ".png")
     plt.close()
 
-def plot_dbscan_results(X, y, n_clusters, out_prefix):
+def plot_dbscan_results(X, y, n_clusters, out_prefix, use_gpu):
     """Draw a scatter plot (png) to show the DBSCAN model fit
 
     A scatter plot of core and accessory distances, coloured by component
@@ -197,10 +197,15 @@ def plot_dbscan_results(X, y, n_clusters, out_prefix):
             Number of clusters used (excluding noise)
         out_prefix (str)
             Prefix for output file (.png will be appended)
+        use_gpu (bool)
+            Whether model was fitted with GPU or not
     """
     # Black removed and is used for noise instead.
     unique_labels = set(y)
-    colours = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]  # changed to work with two clusters
+    if use_gpu:
+      colours = [plt.cm.Spectral(each) for each in cp.linspace(0, 1, len(unique_labels))]  # changed to work with two clusters
+    else:
+      colours = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]  # changed to work with two clusters
 
     fig=plt.figure(figsize=(11, 8), dpi= 160, facecolor='w', edgecolor='k')
     for k in unique_labels:

From bceaa691977930d46b578011b94306a592227d95 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Sun, 21 Jan 2024 19:38:15 +0000
Subject: [PATCH 29/62] Convert to numpy before plotting

---
 PopPUNK/models.py | 13 ++++++++++---
 PopPUNK/plot.py   |  9 ++-------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 9c8fde61..82cb2338 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -478,7 +478,7 @@ def __init__(self, outPrefix, use_gpu = False, max_batch_size = 5000, max_sample
         self.preprocess = True
         self.max_batch_size = max_batch_size
         self.max_samples = max_samples
-        self.use_gpu = use_gpu
+        self.use_gpu = use_gpu # Updated below
 
     def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
         '''Extends :func:`~ClusterFit.fit`
@@ -525,7 +525,10 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
                                 gpu_lib,
                                 quit_on_fail = True)
             if use_gpu:
+                self.use_gpu = True
                 self.subsampled_X = cp.asarray(self.subsampled_X)
+            else
+                self.use_gpu = False
 
         indistinct_clustering = True
         while indistinct_clustering and min_cluster_size >= min_samples and min_samples >= 10:
@@ -660,11 +663,15 @@ def plot(self, X=None, y=None):
             sys.stderr.write("\t" + str(centre) + "\n")
         sys.stderr.write("\n")
 
+        # Convert to numpy for plotting
+        if self.use_gpu:
+            import cupy as cp
+            self.subsampled_X = cp.asnumpy(self.scale)
+
         plot_dbscan_results(self.subsampled_X * self.scale,
                             self.assign(self.subsampled_X, no_scale=True, progress=False),
                             self.n_clusters,
-                            self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_dbscan",
-                            self.use_gpu)
+                            self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_dbscan")
 
 
     def assign(self, X, no_scale = False, progress = True, max_batch_size = 5000, use_gpu = False):
diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py
index 22e639b3..faf5ac37 100644
--- a/PopPUNK/plot.py
+++ b/PopPUNK/plot.py
@@ -182,7 +182,7 @@ def plot_results(X, Y, means, covariances, scale, title, out_prefix):
     plt.savefig(out_prefix + ".png")
     plt.close()
 
-def plot_dbscan_results(X, y, n_clusters, out_prefix, use_gpu):
+def plot_dbscan_results(X, y, n_clusters, out_prefix):
     """Draw a scatter plot (png) to show the DBSCAN model fit
 
     A scatter plot of core and accessory distances, coloured by component
@@ -197,15 +197,10 @@ def plot_dbscan_results(X, y, n_clusters, out_prefix, use_gpu):
             Number of clusters used (excluding noise)
         out_prefix (str)
             Prefix for output file (.png will be appended)
-        use_gpu (bool)
-            Whether model was fitted with GPU or not
     """
     # Black removed and is used for noise instead.
     unique_labels = set(y)
-    if use_gpu:
-      colours = [plt.cm.Spectral(each) for each in cp.linspace(0, 1, len(unique_labels))]  # changed to work with two clusters
-    else:
-      colours = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]  # changed to work with two clusters
+    colours = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]  # changed to work with two clusters
 
     fig=plt.figure(figsize=(11, 8), dpi= 160, facecolor='w', edgecolor='k')
     for k in unique_labels:

From a19940364f05d84c8ba00cd8d2c4132477759e98 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Sun, 21 Jan 2024 19:39:12 +0000
Subject: [PATCH 30/62] Add colon

---
 PopPUNK/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 82cb2338..2aa4df9a 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -527,7 +527,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
             if use_gpu:
                 self.use_gpu = True
                 self.subsampled_X = cp.asarray(self.subsampled_X)
-            else
+            else:
                 self.use_gpu = False
 
         indistinct_clustering = True

From 20c7bb6dc2a173d5d75c22c37cdf78a299fcc40d Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Sun, 21 Jan 2024 19:47:41 +0000
Subject: [PATCH 31/62] Specify add GPU

---
 PopPUNK/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 2aa4df9a..02220492 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -669,7 +669,7 @@ def plot(self, X=None, y=None):
             self.subsampled_X = cp.asnumpy(self.scale)
 
         plot_dbscan_results(self.subsampled_X * self.scale,
-                            self.assign(self.subsampled_X, no_scale=True, progress=False),
+                            self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu=self.use_gpu),
                             self.n_clusters,
                             self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_dbscan")
 

From 0f04894b5b820e97a0da1748a74ce46f042dc66c Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Mon, 22 Jan 2024 05:15:37 +0000
Subject: [PATCH 32/62] Move cupy conversion

---
 PopPUNK/models.py |  8 ++------
 PopPUNK/plot.py   | 10 +++++++++-
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 02220492..87f68fa7 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -663,15 +663,11 @@ def plot(self, X=None, y=None):
             sys.stderr.write("\t" + str(centre) + "\n")
         sys.stderr.write("\n")
 
-        # Convert to numpy for plotting
-        if self.use_gpu:
-            import cupy as cp
-            self.subsampled_X = cp.asnumpy(self.scale)
-
         plot_dbscan_results(self.subsampled_X * self.scale,
                             self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu=self.use_gpu),
                             self.n_clusters,
-                            self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_dbscan")
+                            self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_dbscan",
+                            self.use_gpu)
 
 
     def assign(self, X, no_scale = False, progress = True, max_batch_size = 5000, use_gpu = False):
diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py
index faf5ac37..a0f5d226 100644
--- a/PopPUNK/plot.py
+++ b/PopPUNK/plot.py
@@ -182,7 +182,7 @@ def plot_results(X, Y, means, covariances, scale, title, out_prefix):
     plt.savefig(out_prefix + ".png")
     plt.close()
 
-def plot_dbscan_results(X, y, n_clusters, out_prefix):
+def plot_dbscan_results(X, y, n_clusters, out_prefix, use_gpu):
     """Draw a scatter plot (png) to show the DBSCAN model fit
 
     A scatter plot of core and accessory distances, coloured by component
@@ -197,7 +197,15 @@ def plot_dbscan_results(X, y, n_clusters, out_prefix):
             Number of clusters used (excluding noise)
         out_prefix (str)
             Prefix for output file (.png will be appended)
+        use_gpu (bool)
+            Whether model was fitted with GPU-enabled code
     """
+    # Convert data if from GPU
+    if use_gpu:
+        # Convert to numpy for plotting
+        import cupy as cp
+        X = cp.asnumpy(X)
+    
     # Black removed and is used for noise instead.
     unique_labels = set(y)
     colours = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]  # changed to work with two clusters

From bf5d65714304e93edf951b901817baa87525909b Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Mon, 22 Jan 2024 05:25:35 +0000
Subject: [PATCH 33/62] Add cupy scaling

---
 PopPUNK/models.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 87f68fa7..a177d909 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -663,6 +663,11 @@ def plot(self, X=None, y=None):
             sys.stderr.write("\t" + str(centre) + "\n")
         sys.stderr.write("\n")
 
+        # Harmonise scales
+        if self.use_gpu:
+            import cupy as cp
+            self.scale = cp.asarray(self.scale)
+
         plot_dbscan_results(self.subsampled_X * self.scale,
                             self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu=self.use_gpu),
                             self.n_clusters,

From 3fe3529253c870478724566309371270eb34cc3d Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Tue, 23 Jan 2024 16:29:29 +0000
Subject: [PATCH 34/62] Update use of cugraph functions

---
 PopPUNK/network.py | 55 +++++++++++++++++++---------------------------
 1 file changed, 23 insertions(+), 32 deletions(-)

diff --git a/PopPUNK/network.py b/PopPUNK/network.py
index 244e9e89..b6da9a1b 100644
--- a/PopPUNK/network.py
+++ b/PopPUNK/network.py
@@ -225,7 +225,7 @@ def translate_network_indices(G_ref_df, reference_indices):
     # Translate network indices to match name order
     G_ref_df['source'] = [reference_indices.index(x) for x in G_ref_df['old_source'].to_arrow().to_pylist()]
     G_ref_df['destination'] = [reference_indices.index(x) for x in G_ref_df['old_destination'].to_arrow().to_pylist()]
-    G_ref = add_self_loop(G_ref_df, len(reference_indices) - 1, renumber = True)
+    G_ref = generate_cugraph(G_ref_df, len(reference_indices) - 1, renumber = True)
     return(G_ref)
 
 def extractReferences(G, dbOrder, outPrefix, outSuffix = '', type_isolate = None,
@@ -881,7 +881,7 @@ def construct_network_from_df(rlist,
         use_weights = False
         if weights:
             use_weights = True
-        G = add_self_loop(G_df, max_in_vertex_labels, weights = use_weights, renumber = False)
+        G = generate_cugraph(G_df, max_in_vertex_labels, weights = use_weights, renumber = False)
     else:
         # Convert bool to list of weights or None
         if weights:
@@ -1009,7 +1009,7 @@ def construct_dense_weighted_network(rlist, distMat, weights_type = None, use_gp
         G_df['destination'] = [edge_list[0][1]]
         G_df['weights'] = weights
         max_in_vertex_labels = len(vertex_labels)-1
-        G = add_self_loop(G_df, max_in_vertex_labels, weights = True, renumber = False)
+        G = generate_cugraph(G_df, max_in_vertex_labels, weights = True, renumber = False)
     else:
         # Construct network with CPU via edge list
         weighted_edges = []
@@ -1125,8 +1125,9 @@ def get_cugraph_triangles(G):
     """
     nlen = G.number_of_vertices()
     df = G.view_edge_list()
+    df = df.drop(df[df.source == df.destination].index)
     A = cp.full((nlen, nlen), 0, dtype = cp.int32)
-    A[df.src.values, df.dst.values] = 1
+    A[df.source.values, df.destination.values] = 1
     A = cp.maximum( A, A.transpose() )
     triangle_count = int(cp.around(cp.trace(cp.matmul(A, cp.matmul(A, A)))/6,0))
     return triangle_count
@@ -1155,11 +1156,9 @@ def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sa
         component_nums = component_assignments['labels'].unique().astype(int)
         components = len(component_nums)
         density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1)
-        # consistent with graph-tool for small graphs - triangle counts differ for large graphs
-        # could reflect issue https://github.com/rapidsai/cugraph/issues/1043
-        # this command can be restored once the above issue is fixed - scheduled for cugraph 0.20
-#        triangle_count = cugraph.community.triangle_count.triangles(G)/3
-        triangle_count = 3*get_cugraph_triangles(G)
+        # need to check consistent with graph-tool
+        triangle_count = cugraph.triangle_count(G)/3
+#        triangle_count = 3*get_cugraph_triangles(G)
         degree_df = G.in_degree()
         # consistent with graph-tool
         triad_count = 0.5 * sum([d * (d - 1) for d in degree_df[degree_df['degree'] > 1]['degree'].to_pandas()])
@@ -1353,8 +1352,8 @@ def addQueryToNetwork(dbFuncs, rList, qList, G,
 
     return G, qqDistMat
 
-def add_self_loop(G_df, seq_num, weights = False, renumber = True):
-    """Adds self-loop to cugraph graph to ensure all nodes are included in
+def generate_cugraph(G_df, seq_num, weights = False, renumber = True):
+    """Builds cugraph graph to ensure all nodes are included in
     the graph, even if singletons.
 
     Args:
@@ -1370,28 +1369,20 @@ def add_self_loop(G_df, seq_num, weights = False, renumber = True):
             Dictionary of cluster assignments (keys are sequence names)
     """
     # use self-loop to ensure all nodes are present
-    min_in_df = np.amin([G_df['source'].min(), G_df['destination'].min()])
-    if min_in_df.item() > 0:
-        G_self_loop = cudf.DataFrame()
-        G_self_loop['source'] = [0]
-        G_self_loop['destination'] = [0]
-        if weights:
-            G_self_loop['weights'] = 0.0
-        G_df = cudf.concat([G_df,G_self_loop], ignore_index = True)
-    max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()])
-    if max_in_df.item() != seq_num:
-        G_self_loop = cudf.DataFrame()
-        G_self_loop['source'] = [seq_num]
-        G_self_loop['destination'] = [seq_num]
-        if weights:
-            G_self_loop['weights'] = 0.0
-        G_df = cudf.concat([G_df,G_self_loop], ignore_index = True)
+    node_indices = cudf.Series(range(seq_num), dtype = cp.int32)
+    G_self_loop = cudf.DataFrame()
+    G_self_loop['source'] = node_indices
+    G_self_loop['destination'] = node_indices
+    G_self_loop['weights'] = 0.0
+    # Build cudf
+    # Weights are needed to extract subgraphs
+    # see https://github.com/rapidsai/cugraph/blob/77d833ad/python/cugraph/cugraph/community/subgraph_extraction.py
+    if not weights:
+        G_df['weights'] = 1.0
+    G_df = cudf.concat([G_self_loop,G_df], ignore_index = True)
     # Construct graph
     G_new = cugraph.Graph()
-    if weights:
-        G_new.from_cudf_edgelist(G_df, edge_attr = 'weights', renumber = renumber)
-    else:
-        G_new.from_cudf_edgelist(G_df, renumber = renumber)
+    G_new.from_cudf_edgelist(G_df, edge_attr = 'weights', renumber = renumber)
     return G_new
 
 
@@ -1847,7 +1838,7 @@ def sparse_mat_to_network(sparse_mat, rlist, use_gpu = False):
         G_df['destination'] = sparse_mat.col
         G_df['weights'] = sparse_mat.data
         max_in_vertex_labels = len(rlist)-1
-        G = add_self_loop(G_df, max_in_vertex_labels, weights = True, renumber = False)
+        G = generate_cugraph(G_df, max_in_vertex_labels, weights = True, renumber = False)
     else:
         connections = []
         for (src,dst) in zip(sparse_mat.row,sparse_mat.col):

From e7dba83a1a0c655f17cc5c7f80f77f1078f70206 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Tue, 23 Jan 2024 16:32:21 +0000
Subject: [PATCH 35/62] Update function name

---
 PopPUNK/refine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py
index 3239bb0a..70b2e7ca 100644
--- a/PopPUNK/refine.py
+++ b/PopPUNK/refine.py
@@ -42,7 +42,7 @@
 from .network import construct_network_from_df, printClusters
 from .network import construct_network_from_edge_list
 from .network import networkSummary
-from .network import add_self_loop
+from .network import generate_cugraph
 
 from .utils import transformLine
 from .utils import decisionBoundary

From 89e5ab8a870fb808d0f272bf21244d4271a126bd Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Tue, 23 Jan 2024 16:52:29 +0000
Subject: [PATCH 36/62] Update network functions

---
 PopPUNK/network.py | 34 ++++++----------------------------
 1 file changed, 6 insertions(+), 28 deletions(-)

diff --git a/PopPUNK/network.py b/PopPUNK/network.py
index b6da9a1b..2cd48207 100644
--- a/PopPUNK/network.py
+++ b/PopPUNK/network.py
@@ -678,7 +678,7 @@ def construct_network_from_edge_list(rlist,
                                         betweenness_sample = betweenness_sample_default,
                                         summarise = True,
                                         use_gpu = False):
-    """Construct an undirected network using a data frame of edges. Nodes are samples and
+    """Construct an undirected network using a list of edges as tuples. Nodes are samples and
     edges where samples are within the same cluster
 
     Will print summary statistics about the network to ``STDERR``
@@ -688,8 +688,8 @@ def construct_network_from_edge_list(rlist,
             List of reference sequence labels
         qlist (list)
             List of query sequence labels
-        G_df (cudf or pandas data frame)
-            Data frame in which the first two columns are the nodes linked by edges
+        edge_list (list of tuples)
+            List of tuples describing the edges of the graph
         weights (list)
             List of edge weights
         distMat (2 column ndarray)
@@ -735,7 +735,7 @@ def construct_network_from_edge_list(rlist,
         else:
             # Cannot generate an array when one edge
             G_df = cudf.DataFrame(columns = ['source','destination'])
-            G_df['source'] = [edge_list[0][0]]
+            G_df['source'] = [edge_list[0][0]] # This does not work
             G_df['destination'] = [edge_list[0][1]]
         if weights is not None:
             G_df['weights'] = weights
@@ -1110,28 +1110,6 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label =
 
     return G
 
-def get_cugraph_triangles(G):
-    """Counts the number of triangles in a cugraph
-    network. Can be removed when the cugraph issue
-    https://github.com/rapidsai/cugraph/issues/1043 is fixed.
-
-    Args:
-        G (cugraph network)
-            Network to be analysed
-
-    Returns:
-        triangle_count (int)
-            Count of triangles in graph
-    """
-    nlen = G.number_of_vertices()
-    df = G.view_edge_list()
-    df = df.drop(df[df.source == df.destination].index)
-    A = cp.full((nlen, nlen), 0, dtype = cp.int32)
-    A[df.source.values, df.destination.values] = 1
-    A = cp.maximum( A, A.transpose() )
-    triangle_count = int(cp.around(cp.trace(cp.matmul(A, cp.matmul(A, A)))/6,0))
-    return triangle_count
-
 def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sample_default,
                     use_gpu = False):
     """Provides summary values about the network
@@ -1157,8 +1135,8 @@ def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sa
         components = len(component_nums)
         density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1)
         # need to check consistent with graph-tool
-        triangle_count = cugraph.triangle_count(G)/3
-#        triangle_count = 3*get_cugraph_triangles(G)
+        triangle_counts = cugraph.triangle_count(G)
+        triangle_count = triangle_counts['counts'].sum()/3
         degree_df = G.in_degree()
         # consistent with graph-tool
         triad_count = 0.5 * sum([d * (d - 1) for d in degree_df[degree_df['degree'] > 1]['degree'].to_pandas()])

From 2931541f19096a0374588a5ab9d02add11d14b10 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Tue, 23 Jan 2024 17:45:07 +0000
Subject: [PATCH 37/62] Improve error reporting

---
 PopPUNK/network.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/PopPUNK/network.py b/PopPUNK/network.py
index 2cd48207..3df5719b 100644
--- a/PopPUNK/network.py
+++ b/PopPUNK/network.py
@@ -162,6 +162,8 @@ def checkNetworkVertexCount(seq_list, G, use_gpu):
     networkMissing = set(set(range(len(seq_list))).difference(vertex_list))
     if len(networkMissing) > 0:
         sys.stderr.write("ERROR: " + str(len(networkMissing)) + " samples are missing from the final network\n")
+        if len(networkMissing) < 10:
+            sys.stderr.write('Missing isolate are: ' + ' '.join(networkMissing))
         sys.exit(1)
 
 def getCliqueRefs(G, reference_indices = set()):
@@ -732,11 +734,14 @@ def construct_network_from_edge_list(rlist,
             edge_array = cp.array(edge_list, dtype = np.int32)
             edge_gpu_matrix = cuda.to_device(edge_array)
             G_df = cudf.DataFrame(edge_gpu_matrix, columns = ['source','destination'])
-        else:
+        elif len(edge_list) == 1:
             # Cannot generate an array when one edge
             G_df = cudf.DataFrame(columns = ['source','destination'])
-            G_df['source'] = [edge_list[0][0]] # This does not work
+            G_df['source'] = [edge_list[0][0]]
             G_df['destination'] = [edge_list[0][1]]
+        else:
+            sys.stderr.write('ERROR: Missing link in graph assignment')
+            sys.exit()
         if weights is not None:
             G_df['weights'] = weights
         G = construct_network_from_df(rlist, qlist, G_df,

From 3e923fdbf97c62ccff2fece902d4c8faf0f0e72d Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Tue, 23 Jan 2024 17:52:27 +0000
Subject: [PATCH 38/62] Fix reporting bug

---
 PopPUNK/network.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PopPUNK/network.py b/PopPUNK/network.py
index 3df5719b..c1642183 100644
--- a/PopPUNK/network.py
+++ b/PopPUNK/network.py
@@ -163,7 +163,7 @@ def checkNetworkVertexCount(seq_list, G, use_gpu):
     if len(networkMissing) > 0:
         sys.stderr.write("ERROR: " + str(len(networkMissing)) + " samples are missing from the final network\n")
         if len(networkMissing) < 10:
-            sys.stderr.write('Missing isolate are: ' + ' '.join(networkMissing))
+            sys.stderr.write('Missing isolates are: ' + ' '.join([seq_list[x] for x in networkMissing]))
         sys.exit(1)
 
 def getCliqueRefs(G, reference_indices = set()):

From 72de66bde237959f93ed842a7153879acafb0200 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Tue, 23 Jan 2024 19:10:31 +0000
Subject: [PATCH 39/62] More information on missing isolates

---
 PopPUNK/network.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/PopPUNK/network.py b/PopPUNK/network.py
index c1642183..9b420a4e 100644
--- a/PopPUNK/network.py
+++ b/PopPUNK/network.py
@@ -162,8 +162,11 @@ def checkNetworkVertexCount(seq_list, G, use_gpu):
     networkMissing = set(set(range(len(seq_list))).difference(vertex_list))
     if len(networkMissing) > 0:
         sys.stderr.write("ERROR: " + str(len(networkMissing)) + " samples are missing from the final network\n")
-        if len(networkMissing) < 10:
-            sys.stderr.write('Missing isolates are: ' + ' '.join([seq_list[x] for x in networkMissing]))
+        if len(networkMissing) == 1:
+          sys.stderr.write('Missing isolate is: ' + seq_list[networkMissing[0]] + ' (index ' + str(networkMissing[0]) + ')')
+        elif len(networkMissing) < 10:
+          sys.stderr.write('Missing isolates are: ' + ' '.join([seq_list[x] for x in networkMissing]))
+          sys.stderr.write('These have the indices ' + ' '.join([str(x) for x in networkMissing]))
         sys.exit(1)
 
 def getCliqueRefs(G, reference_indices = set()):

From 8cb9f5191c53f538a41b4b598f9046cf11177523 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Tue, 23 Jan 2024 19:29:07 +0000
Subject: [PATCH 40/62] Fix set processing

---
 PopPUNK/network.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/PopPUNK/network.py b/PopPUNK/network.py
index 9b420a4e..4e0e5594 100644
--- a/PopPUNK/network.py
+++ b/PopPUNK/network.py
@@ -163,10 +163,11 @@ def checkNetworkVertexCount(seq_list, G, use_gpu):
     if len(networkMissing) > 0:
         sys.stderr.write("ERROR: " + str(len(networkMissing)) + " samples are missing from the final network\n")
         if len(networkMissing) == 1:
-          sys.stderr.write('Missing isolate is: ' + seq_list[networkMissing[0]] + ' (index ' + str(networkMissing[0]) + ')')
+            missing_isolate_index = networkMissing.pop()
+            sys.stderr.write('Missing isolate is: ' + seq_list[missing_isolate_index] + ' (index ' + str(missing_isolate_index) + ')')
         elif len(networkMissing) < 10:
-          sys.stderr.write('Missing isolates are: ' + ' '.join([seq_list[x] for x in networkMissing]))
-          sys.stderr.write('These have the indices ' + ' '.join([str(x) for x in networkMissing]))
+            sys.stderr.write('Missing isolates are: ' + ' '.join([seq_list[x] for x in networkMissing]))
+            sys.stderr.write('These have the indices ' + ' '.join([str(x) for x in networkMissing]))
         sys.exit(1)
 
 def getCliqueRefs(G, reference_indices = set()):

From 8ef479031f2064d10d1eb6815a01776798fd8122 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Tue, 23 Jan 2024 19:46:26 +0000
Subject: [PATCH 41/62] Avoid omitting nodes

---
 PopPUNK/network.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/PopPUNK/network.py b/PopPUNK/network.py
index 4e0e5594..28660dd2 100644
--- a/PopPUNK/network.py
+++ b/PopPUNK/network.py
@@ -1339,15 +1339,15 @@ def addQueryToNetwork(dbFuncs, rList, qList, G,
 
     return G, qqDistMat
 
-def generate_cugraph(G_df, seq_num, weights = False, renumber = True):
+def generate_cugraph(G_df, max_index, weights = False, renumber = True):
     """Builds cugraph graph to ensure all nodes are included in
     the graph, even if singletons.
 
     Args:
         G_df (cudf)
             cudf data frame containing edge list
-        seq_num (int)
-            The expected number of nodes in the graph
+        max_index (int)
+            The 0-indexed maximum of the node indices
         renumber (bool)
             Whether to renumber the vertices when added to the graph
 
@@ -1356,7 +1356,7 @@ def generate_cugraph(G_df, seq_num, weights = False, renumber = True):
             Dictionary of cluster assignments (keys are sequence names)
     """
     # use self-loop to ensure all nodes are present
-    node_indices = cudf.Series(range(seq_num), dtype = cp.int32)
+    node_indices = cudf.Series(range(seq_num+1), dtype = cp.int32)
     G_self_loop = cudf.DataFrame()
     G_self_loop['source'] = node_indices
     G_self_loop['destination'] = node_indices

From 2d458659125fd29aba5413f948393454ac106631 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Tue, 23 Jan 2024 19:52:53 +0000
Subject: [PATCH 42/62] Fix variable name

---
 PopPUNK/network.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PopPUNK/network.py b/PopPUNK/network.py
index 28660dd2..f0839d02 100644
--- a/PopPUNK/network.py
+++ b/PopPUNK/network.py
@@ -1356,7 +1356,7 @@ def generate_cugraph(G_df, max_index, weights = False, renumber = True):
             Dictionary of cluster assignments (keys are sequence names)
     """
     # use self-loop to ensure all nodes are present
-    node_indices = cudf.Series(range(seq_num+1), dtype = cp.int32)
+    node_indices = cudf.Series(range(max_index+1), dtype = cp.int32)
     G_self_loop = cudf.DataFrame()
     G_self_loop['source'] = node_indices
     G_self_loop['destination'] = node_indices

From e5f5b9a5e42bb5ddd56e44fa66d9a2ccb0ea3005 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Tue, 23 Jan 2024 21:18:21 +0000
Subject: [PATCH 43/62] Add BGMM max batch size

---
 PopPUNK/__main__.py |  2 +-
 PopPUNK/models.py   | 22 +++++++++++++---------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py
index f53d6635..7dff02f0 100644
--- a/PopPUNK/__main__.py
+++ b/PopPUNK/__main__.py
@@ -501,7 +501,7 @@ def main():
                 assignments = model.fit(distMat, args.D, args.min_cluster_prop, args.gpu_model)
             # Run Gaussian model
             elif args.fit_model == "bgmm":
-                model = BGMMFit(output, max_samples = args.model_subsample)
+                model = BGMMFit(output, max_samples = args.model_subsample, max_batch_size = args.assign_subsample)
                 model.set_threads(args.threads)
                 assignments = model.fit(distMat, args.K)
             elif args.fit_model == "refine":
diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index a177d909..f84d2b3d 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -293,12 +293,12 @@ class BGMMFit(ClusterFit):
             (default = 100000)
     '''
 
-    def __init__(self, outPrefix, max_samples = 100000):
+    def __init__(self, outPrefix, max_samples = 100000, max_batch_size = 100000):
         ClusterFit.__init__(self, outPrefix)
         self.type = 'bgmm'
         self.preprocess = True
         self.max_samples = max_samples
-
+        self.max_batch_size = max_batch_size
 
     def fit(self, X, max_components):
         '''Extends :func:`~ClusterFit.fit`
@@ -326,7 +326,7 @@ def fit(self, X, max_components):
         self.covariances = self.dpgmm.covariances_
         self.fitted = True
 
-        y = self.assign(X)
+        y = self.assign(X, max_batch_size = self.max_batch_size)
         self.within_label = findWithinLabel(self.means, y)
         self.between_label = findWithinLabel(self.means, y, 1)
         return y
@@ -384,7 +384,7 @@ def plot(self, X, y):
         if not hasattr(self, 'subsampled_X'):
             self.subsampled_X = utils.shuffle(X, random_state=random.randint(1,10000))[0:self.max_samples,]
 
-        y_subsample = self.assign(self.subsampled_X, values=True, progress=False)
+        y_subsample = self.assign(self.subsampled_X, max_batch_size = self.max_batch_size, values=True, progress=False)
         avg_entropy = np.mean(np.apply_along_axis(stats.entropy, 1,
                                                   y_subsample))
         used_components = np.unique(y).size
@@ -402,7 +402,7 @@ def plot(self, X, y):
         plot_contours(self, y, title + " assignment boundary", outfile + "_contours")
 
 
-    def assign(self, X, values = False, progress=True):
+    def assign(self, X, max_batch_size = 100000, values = False, progress=True):
         '''Assign the clustering of new samples using :func:`~PopPUNK.bgmm.assign_samples`
 
         Args:
@@ -410,8 +410,8 @@ def assign(self, X, values = False, progress=True):
                 Core and accessory distances
             values (bool)
                 Return the responsibilities of assignment rather than most likely cluster
-            num_processes (int)
-                Number of threads to use
+            max_batch_size (int)
+                Size of batches to be assigned
             progress (bool)
                 Show progress bar
 
@@ -430,7 +430,7 @@ def assign(self, X, values = False, progress=True):
                 y = np.zeros((X.shape[0], len(self.weights)), dtype=X.dtype)
             else:
                 y = np.zeros(X.shape[0], dtype=int)
-            block_size = 100000
+            block_size = max_batch_size
             with SharedMemoryManager() as smm:
                 shm_X = smm.SharedMemory(size = X.nbytes)
                 X_shared_array = np.ndarray(X.shape, dtype = X.dtype, buffer = shm_X.buf)
@@ -574,7 +574,11 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
                       self.cluster_mins[i,] = [np.min(self.subsampled_X[self.labels==i,0]),np.min(self.subsampled_X[self.labels==i,1])]
                       self.cluster_maxs[i,] = [np.max(self.subsampled_X[self.labels==i,0]),np.max(self.subsampled_X[self.labels==i,1])]
 
-                  y = self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu = False)
+                  y = self.assign(self.subsampled_X,
+                                  no_scale=True,
+                                  progress=False,
+                                  max_batch_size = self.subsampled_X.shape[0],
+                                  use_gpu = False)
               
               # Evaluate clustering
               self.within_label = findWithinLabel(self.cluster_means, y)

From 758e0b122fb8988e8f5a7e2de5d4aa20b806b0c1 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Tue, 23 Jan 2024 21:35:44 +0000
Subject: [PATCH 44/62] Update assignment in fitting

---
 PopPUNK/models.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index f84d2b3d..2be23e3b 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -673,7 +673,11 @@ def plot(self, X=None, y=None):
             self.scale = cp.asarray(self.scale)
 
         plot_dbscan_results(self.subsampled_X * self.scale,
-                            self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu=self.use_gpu),
+                            self.assign(self.subsampled_X,
+                                        max_batch_size = self.max_batch_size,
+                                        no_scale=True,
+                                        progress=False,
+                                        use_gpu=self.use_gpu),
                             self.n_clusters,
                             self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_dbscan",
                             self.use_gpu)

From 3c92279d9efb9d4388a05adf2acbeea8bad2a422 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Wed, 24 Jan 2024 06:12:24 +0000
Subject: [PATCH 45/62] GPU specification in save and load

---
 PopPUNK/models.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 2be23e3b..b4d308f8 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -614,7 +614,8 @@ def save(self):
              means=self.cluster_means,
              maxs=self.cluster_maxs,
              mins=self.cluster_mins,
-             scale=self.scale)
+             scale=self.scale,
+             use_gpu=self.use_gpu)
             with open(self.outPrefix + "/" + os.path.basename(self.outPrefix) + '_fit.pkl', 'wb') as pickle_file:
                 pickle.dump([self.hdb, self.type], pickle_file)
 
@@ -637,6 +638,8 @@ def load(self, fit_npz, fit_obj):
         self.cluster_means = fit_npz['means']
         self.cluster_maxs = fit_npz['maxs']
         self.cluster_mins = fit_npz['mins']
+        self.scale = fit_npz['scale']
+        self.use_gpu = fit_npz['use_gpu']
         self.fitted = True
 
 
@@ -687,7 +690,7 @@ def assign(self, X, no_scale = False, progress = True, max_batch_size = 5000, us
         '''Assign the clustering of new samples using :func:`~PopPUNK.dbscan.assign_samples_dbscan`
 
         Args:
-            X (numpy.array)
+            X (numpy.array or cupy.array)
                 Core and accessory distances
             no_scale (bool)
                 Do not scale X

From e857fca3f729647d2f955e18e328f1ae1b5491e9 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Wed, 24 Jan 2024 06:38:07 +0000
Subject: [PATCH 46/62] Explicit numpy conversion in predict

---
 PopPUNK/models.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index b4d308f8..d2d8f2aa 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -728,8 +728,11 @@ def assign(self, X, no_scale = False, progress = True, max_batch_size = 5000, us
                   start_index = block*block_size
                   end_index = min((block+1)*block_size-1,X.shape[0])
                   sys.stderr.write("Processing rows " + str(start_index) + " to " + str(end_index) + "\n")
-                  y_assignments, y_probabilities = cluster.hdbscan.approximate_predict(self.hdb, X[start_index:end_index,])
-                  y[start_index:end_index] = cp.asnumpy(y_assignments)
+                  # cuml v24.02 always returns numpy therefore make conversion explicit
+                  y[start_index:end_index], y_probabilities = cluster.hdbscan.approximate_predict(self.hdb,
+                                                                                                  X[start_index:end_index,],
+                                                                                                  convert_dtype = True)
+                  del y_probabilities
             else:
               y = np.zeros(X.shape[0], dtype=int)
               n_blocks = (X.shape[0] - 1) // block_size + 1

From 101a910f6869962a8f60126e26f33d3785b016e6 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 16 Feb 2024 08:00:49 +0000
Subject: [PATCH 47/62] Enable fitting without assigning all points

---
 PopPUNK/__main__.py | 55 ++++++++++++++++++++++++++++++++++-----------
 PopPUNK/models.py   | 33 ++++++++++++++++++++++-----
 2 files changed, 69 insertions(+), 19 deletions(-)

diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py
index 7dff02f0..dbbea61a 100644
--- a/PopPUNK/__main__.py
+++ b/PopPUNK/__main__.py
@@ -114,15 +114,34 @@ def get_options():
 
     # model fitting
     modelGroup = parser.add_argument_group('Model fit options')
-    modelGroup.add_argument('--model-subsample', help='Number of pairwise distances used to fit model [default = 100000]',
-                            type=int, default=100000)
-    modelGroup.add_argument('--assign-subsample', help='Number of pairwise distances in each assignment batch [default = 5000]',
-                            type=int, default=5000)
-    modelGroup.add_argument('--K', help='Maximum number of mixture components [default = 2]', type=int, default=2)
-    modelGroup.add_argument('--D', help='Maximum number of clusters in DBSCAN fitting [default = 100]', type=int, default=100)
-    modelGroup.add_argument('--min-cluster-prop', help='Minimum proportion of points in a cluster '
-                                                        'in DBSCAN fitting [default = 0.0001]', type=float, default=0.0001)
-    modelGroup.add_argument('--threshold', help='Cutoff if using --fit-model threshold', type=float)
+    modelGroup.add_argument('--model-subsample',
+                            help='Number of pairwise distances used to fit model [default = 100000]',
+                            type=int,
+                            default=100000)
+    modelGroup.add_argument('--assign-subsample',
+                            help='Number of pairwise distances in each assignment batch [default = 5000]',
+                            type=int,
+                            default=5000)
+    modelGroup.add_argument('--no-assign',
+                            help='Fit the model without assigning all points',
+                            default=False,
+                            action='store_true'))
+    modelGroup.add_argument('--K',
+                            help='Maximum number of mixture components [default = 2]',
+                            type=int,
+                            default=2)
+    modelGroup.add_argument('--D',
+                            help='Maximum number of clusters in DBSCAN fitting [default = 100]',
+                            type=int,
+                            default=100)
+    modelGroup.add_argument('--min-cluster-prop',
+                            help='Minimum proportion of points in a cluster '
+                                 'in DBSCAN fitting [default = 0.0001]',
+                            type=float,
+                            default=0.0001)
+    modelGroup.add_argument('--threshold',
+                            help='Cutoff if using --fit-model threshold',
+                            type=float)
 
     # model refinement
     refinementGroup = parser.add_argument_group('Refine model options')
@@ -496,14 +515,24 @@ def main():
         if args.fit_model:
             # Run DBSCAN model
             if args.fit_model == "dbscan":
-                model = DBSCANFit(output, max_samples = args.model_subsample, max_batch_size = args.assign_subsample)
+                model = DBSCANFit(output,
+                                  max_samples = args.model_subsample,
+                                  max_batch_size = args.assign_subsample,
+                                  assign_points = args.no_assign)
                 model.set_threads(args.threads)
-                assignments = model.fit(distMat, args.D, args.min_cluster_prop, args.gpu_model)
+                assignments = model.fit(distMat,
+                                        args.D,
+                                        args.min_cluster_prop,
+                                        args.gpu_model)
             # Run Gaussian model
             elif args.fit_model == "bgmm":
-                model = BGMMFit(output, max_samples = args.model_subsample, max_batch_size = args.assign_subsample)
+                model = BGMMFit(output,
+                                max_samples = args.model_subsample,
+                                max_batch_size = args.assign_subsample,
+                                assign_points = args.no_assign)
                 model.set_threads(args.threads)
-                assignments = model.fit(distMat, args.K)
+                assignments = model.fit(distMat,
+                                        args.K)
             elif args.fit_model == "refine":
                 new_model = RefineFit(output)
                 new_model.set_threads(args.threads)
diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index d2d8f2aa..f084373f 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -293,12 +293,13 @@ class BGMMFit(ClusterFit):
             (default = 100000)
     '''
 
-    def __init__(self, outPrefix, max_samples = 100000, max_batch_size = 100000):
+    def __init__(self, outPrefix, max_samples = 100000, max_batch_size = 100000, assign_points = True):
         ClusterFit.__init__(self, outPrefix)
         self.type = 'bgmm'
         self.preprocess = True
         self.max_samples = max_samples
         self.max_batch_size = max_batch_size
+        self.assign_points = assign_points
 
     def fit(self, X, max_components):
         '''Extends :func:`~ClusterFit.fit`
@@ -325,8 +326,12 @@ def fit(self, X, max_components):
         self.means = self.dpgmm.means_
         self.covariances = self.dpgmm.covariances_
         self.fitted = True
-
-        y = self.assign(X, max_batch_size = self.max_batch_size)
+        
+        # Allow for partial fitting that only assigns the subsample not the full set
+        if self.assign_points:
+            y = self.assign(X, max_batch_size = self.max_batch_size)
+        else:
+            y = self.assign(self.subsampled_X, max_batch_size = self.max_batch_size)
         self.within_label = findWithinLabel(self.means, y)
         self.between_label = findWithinLabel(self.means, y, 1)
         return y
@@ -472,12 +477,13 @@ class DBSCANFit(ClusterFit):
             (default = 100000)
     '''
 
-    def __init__(self, outPrefix, use_gpu = False, max_batch_size = 5000, max_samples = 100000):
+    def __init__(self, outPrefix, use_gpu = False, max_batch_size = 5000, max_samples = 100000, assign_points = True):
         ClusterFit.__init__(self, outPrefix)
         self.type = 'dbscan'
         self.preprocess = True
         self.max_batch_size = max_batch_size
         self.max_samples = max_samples
+        self.assign_points = assign_points
         self.use_gpu = use_gpu # Updated below
 
     def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
@@ -598,7 +604,12 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
         elif not use_gpu:
             shutil.rmtree(cache_out)
 
-        y = self.assign(X, max_batch_size = self.max_batch_size, use_gpu = use_gpu)
+        # Allow for partial fitting that only assigns the subsample not the full set
+        if self.assign_points:
+            y = self.assign(X, max_batch_size = self.max_batch_size, use_gpu = use_gpu)
+        else:
+            y = self.assign(self.subsampled_X, max_batch_size = self.max_batch_size, use_gpu = use_gpu)
+
         return y
 
 
@@ -615,6 +626,7 @@ def save(self):
              maxs=self.cluster_maxs,
              mins=self.cluster_mins,
              scale=self.scale,
+             assign_points = self.assign_points,
              use_gpu=self.use_gpu)
             with open(self.outPrefix + "/" + os.path.basename(self.outPrefix) + '_fit.pkl', 'wb') as pickle_file:
                 pickle.dump([self.hdb, self.type], pickle_file)
@@ -639,7 +651,16 @@ def load(self, fit_npz, fit_obj):
         self.cluster_maxs = fit_npz['maxs']
         self.cluster_mins = fit_npz['mins']
         self.scale = fit_npz['scale']
-        self.use_gpu = fit_npz['use_gpu']
+        if 'use_gpu' in fit_npz.keys():
+            self.use_gpu = fit_npz['use_gpu']
+        else:
+            # Default for backwards compatibility
+            self.use_gpu = False
+        if 'assign_points' in fit_npz.keys():
+            self.assign_points = fit_npz['assign_points']
+        else:
+            # Default for backwards compatibility
+            self.assign_points = True
         self.fitted = True
 
 

From 6baf0a0cd67e85e5c2052b6eaf5a95528151e6e0 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 16 Feb 2024 08:02:00 +0000
Subject: [PATCH 48/62] Fix bracket

---
 PopPUNK/__main__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py
index dbbea61a..bcb68bad 100644
--- a/PopPUNK/__main__.py
+++ b/PopPUNK/__main__.py
@@ -125,7 +125,7 @@ def get_options():
     modelGroup.add_argument('--no-assign',
                             help='Fit the model without assigning all points',
                             default=False,
-                            action='store_true'))
+                            action='store_true')
     modelGroup.add_argument('--K',
                             help='Maximum number of mixture components [default = 2]',
                             type=int,

From b5d04efe19c491f7b7617151a7632165b6635f58 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 16 Feb 2024 08:37:12 +0000
Subject: [PATCH 49/62] Clarify no-assign use

---
 PopPUNK/__main__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py
index bcb68bad..e78fbc0a 100644
--- a/PopPUNK/__main__.py
+++ b/PopPUNK/__main__.py
@@ -123,7 +123,7 @@ def get_options():
                             type=int,
                             default=5000)
     modelGroup.add_argument('--no-assign',
-                            help='Fit the model without assigning all points',
+                            help='Fit the model without assigning all points (only applies to BGMM and DBSCAN models)',
                             default=False,
                             action='store_true')
     modelGroup.add_argument('--K',
@@ -518,7 +518,7 @@ def main():
                 model = DBSCANFit(output,
                                   max_samples = args.model_subsample,
                                   max_batch_size = args.assign_subsample,
-                                  assign_points = args.no_assign)
+                                  assign_points = not args.no_assign)
                 model.set_threads(args.threads)
                 assignments = model.fit(distMat,
                                         args.D,
@@ -529,7 +529,7 @@ def main():
                 model = BGMMFit(output,
                                 max_samples = args.model_subsample,
                                 max_batch_size = args.assign_subsample,
-                                assign_points = args.no_assign)
+                                assign_points = not args.no_assign)
                 model.set_threads(args.threads)
                 assignments = model.fit(distMat,
                                         args.K)

From f85bc75c5740ce21eab58f3f86c6ab91eb31a5dd Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 16 Feb 2024 08:40:40 +0000
Subject: [PATCH 50/62] Simplify DBSCAN fitting

---
 PopPUNK/models.py | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index f084373f..8711ed48 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -561,12 +561,6 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
                       self.cluster_means[cp.array(i),] = [cp.mean(self.subsampled_X[labelled_rows,cp.array([0])]),cp.mean(self.subsampled_X[labelled_rows,cp.array([1])])]
                       self.cluster_mins[cp.array(i),] = [cp.min(self.subsampled_X[labelled_rows,cp.array([0])]),cp.min(self.subsampled_X[labelled_rows,cp.array([1])])]
                       self.cluster_maxs[cp.array(i),] = [cp.max(self.subsampled_X[labelled_rows,cp.array([0])]),cp.max(self.subsampled_X[labelled_rows,cp.array([1])])]
-
-                  y = self.assign(self.subsampled_X,
-                                  no_scale=True,
-                                  progress=False,
-                                  max_batch_size = self.subsampled_X.shape[0],
-                                  use_gpu = True)
                   
               else:
                   # get within strain cluster
@@ -580,11 +574,12 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
                       self.cluster_mins[i,] = [np.min(self.subsampled_X[self.labels==i,0]),np.min(self.subsampled_X[self.labels==i,1])]
                       self.cluster_maxs[i,] = [np.max(self.subsampled_X[self.labels==i,0]),np.max(self.subsampled_X[self.labels==i,1])]
 
-                  y = self.assign(self.subsampled_X,
-                                  no_scale=True,
-                                  progress=False,
-                                  max_batch_size = self.subsampled_X.shape[0],
-                                  use_gpu = False)
+              # Run assignment
+              y = self.assign(self.subsampled_X,
+                              no_scale=True,
+                              progress=False,
+                              max_batch_size = self.subsampled_X.shape[0],
+                              use_gpu = use_gpu)
               
               # Evaluate clustering
               self.within_label = findWithinLabel(self.cluster_means, y)

From 94a88d970cdb59cb59bf8e971addb8a636d07863 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 16 Feb 2024 08:50:25 +0000
Subject: [PATCH 51/62] Exit before network construction if not assigning

---
 PopPUNK/__main__.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py
index e78fbc0a..1fc016c3 100644
--- a/PopPUNK/__main__.py
+++ b/PopPUNK/__main__.py
@@ -591,6 +591,10 @@ def main():
         else:
             assignments = model.assign(distMat)
 
+        # end here if not assigning data
+        if args.no_assign:
+            sys.exit(0)
+
         #******************************#
         #*                            *#
         #* network construction       *#

From 00b865109eb1d8d84924db731cd1ebbb2290c6ab Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 16 Feb 2024 09:06:48 +0000
Subject: [PATCH 52/62] Do not allow assignment with an incompletely fitting
 model

---
 PopPUNK/assign.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py
index be1b848d..f601c9b5 100644
--- a/PopPUNK/assign.py
+++ b/PopPUNK/assign.py
@@ -408,6 +408,11 @@ def assign_query_hdf5(dbFuncs,
         raise RuntimeError("lineage models cannot be used with --serial")
     model.set_threads(threads)
 
+    # Only proceed with a fully-fitted model
+    if not self.fitted or (hasattr(self,'assign_points') and self.assign_points == False):
+        sys.stderr.write('Cannot assign points with an incompletely-fitted model\n')
+        sys.exit(1)
+
     # Set directories of previous fit
     if previous_clustering is not None:
         prev_clustering = previous_clustering

From 65035ba20baf4881140e9a003bbd43781bb7eccf Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 16 Feb 2024 09:20:59 +0000
Subject: [PATCH 53/62] Fix model assignment checks

---
 PopPUNK/models.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 8711ed48..be9562d5 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -800,6 +800,7 @@ def __init__(self, outPrefix):
         self.slope = 2
         self.threshold = False
         self.unconstrained = False
+        self.assign_points = True
 
     def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indiv_refine = False,
             unconstrained = False, multi_boundary = 0, score_idx = 0, no_local = False,

From bc2ba191255ca0f0b2201da852e800457001c39a Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 16 Feb 2024 09:28:34 +0000
Subject: [PATCH 54/62] Correct attribute checking

---
 PopPUNK/assign.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py
index f601c9b5..6f898b17 100644
--- a/PopPUNK/assign.py
+++ b/PopPUNK/assign.py
@@ -409,7 +409,7 @@ def assign_query_hdf5(dbFuncs,
     model.set_threads(threads)
 
     # Only proceed with a fully-fitted model
-    if not self.fitted or (hasattr(self,'assign_points') and self.assign_points == False):
+    if not model.fitted or (hasattr(model,'assign_points') and model.assign_points == False):
         sys.stderr.write('Cannot assign points with an incompletely-fitted model\n')
         sys.exit(1)
 

From 0382227c5041279c843c63a3e774eed3735f9e4d Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 23 Feb 2024 09:44:41 +0000
Subject: [PATCH 55/62] Improve CLI

---
 PopPUNK/__main__.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py
index 1fc016c3..05d37355 100644
--- a/PopPUNK/__main__.py
+++ b/PopPUNK/__main__.py
@@ -122,8 +122,8 @@ def get_options():
                             help='Number of pairwise distances in each assignment batch [default = 5000]',
                             type=int,
                             default=5000)
-    modelGroup.add_argument('--no-assign',
-                            help='Fit the model without assigning all points (only applies to BGMM and DBSCAN models)',
+    modelGroup.add_argument('--for-refine',
+                            help='Fit a BGMM or DBSCAN model without assigning all points to initialise a refined model',
                             default=False,
                             action='store_true')
     modelGroup.add_argument('--K',
@@ -518,7 +518,7 @@ def main():
                 model = DBSCANFit(output,
                                   max_samples = args.model_subsample,
                                   max_batch_size = args.assign_subsample,
-                                  assign_points = not args.no_assign)
+                                  assign_points = not args.for_refine)
                 model.set_threads(args.threads)
                 assignments = model.fit(distMat,
                                         args.D,
@@ -529,7 +529,7 @@ def main():
                 model = BGMMFit(output,
                                 max_samples = args.model_subsample,
                                 max_batch_size = args.assign_subsample,
-                                assign_points = not args.no_assign)
+                                assign_points = not args.for_refine)
                 model.set_threads(args.threads)
                 assignments = model.fit(distMat,
                                         args.K)
@@ -592,7 +592,7 @@ def main():
             assignments = model.assign(distMat)
 
         # end here if not assigning data
-        if args.no_assign:
+        if args.for_refine:
             sys.exit(0)
 
         #******************************#

From 843f7be5febeeee7be308f21f97fb947b7466d52 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 23 Feb 2024 09:56:54 +0000
Subject: [PATCH 56/62] Add reassuring exit message

---
 PopPUNK/__main__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py
index 05d37355..30daff34 100644
--- a/PopPUNK/__main__.py
+++ b/PopPUNK/__main__.py
@@ -593,6 +593,7 @@ def main():
 
         # end here if not assigning data
         if args.for_refine:
+            sys.stderr.write('Initial model fit complete; points will be assigned when this model is refined\nusing "--fit-model refine"\n')
             sys.exit(0)
 
         #******************************#

From 3f1ea312fd58874b133b3c77750c0ccfdaf4886e Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 23 Feb 2024 09:58:26 +0000
Subject: [PATCH 57/62] Improve error message

---
 PopPUNK/assign.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py
index 6f898b17..da8adc87 100644
--- a/PopPUNK/assign.py
+++ b/PopPUNK/assign.py
@@ -410,7 +410,7 @@ def assign_query_hdf5(dbFuncs,
 
     # Only proceed with a fully-fitted model
     if not model.fitted or (hasattr(model,'assign_points') and model.assign_points == False):
-        sys.stderr.write('Cannot assign points with an incompletely-fitted model\n')
+        sys.stderr.write('Cannot assign points with an incompletely-fitted model\nPlease refine this initial fit with "--fit-model refine"\n')
         sys.exit(1)
 
     # Set directories of previous fit

From 5454f7a92d7428b65d34240cae505e19eb800fb6 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 23 Feb 2024 10:53:24 +0000
Subject: [PATCH 58/62] Fix awful min_sample condition

---
 PopPUNK/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index be9562d5..6067c545 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -513,7 +513,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
 
         # DBSCAN parameters
         cache_out = "./" + self.outPrefix + "_cache"
-        min_samples = min(max(int(min_cluster_prop * self.subsampled_X.shape[0]), 10),1023)
+        min_samples = min(int(min_cluster_prop * self.subsampled_X.shape[0]), 1023)
         min_cluster_size = max(int(0.01 * self.subsampled_X.shape[0]), 10)
 
         # Check on initialisation of GPU libraries and memory

From f721ca1edffb3012e71d480a5fbca4c2a2378db3 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 23 Feb 2024 11:12:21 +0000
Subject: [PATCH 59/62] Clarify awful min_sample condition

---
 PopPUNK/models.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 6067c545..832a27ee 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -513,7 +513,8 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False):
 
         # DBSCAN parameters
         cache_out = "./" + self.outPrefix + "_cache"
-        min_samples = min(int(min_cluster_prop * self.subsampled_X.shape[0]), 1023)
+        min_samples = max(int(min_cluster_prop * self.subsampled_X.shape[0]), 10) # do not allow clusters of < 10 points
+        min_samples = min(min_samples,1023) # do not allow clusters to require more than 1023 points at the start
         min_cluster_size = max(int(0.01 * self.subsampled_X.shape[0]), 10)
 
         # Check on initialisation of GPU libraries and memory

From 9a229c67bf31b78ae7ecc249de9ba5892ed355e9 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Tue, 27 Feb 2024 10:54:58 +0000
Subject: [PATCH 60/62] Add tests

---
 test/run_test.py | 1 +
 test/test-gpu.py | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/test/run_test.py b/test/run_test.py
index 5f26a2f8..fa14c96c 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -31,6 +31,7 @@
 
 #fit dbscan
 sys.stderr.write("Running DBSCAN model fit (--fit-model dbscan)\n")
+subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model dbscan --ref-db example_db --output example_dbscan --overwrite --graph-weights --for-refine", shell=True, check=True)
 subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model dbscan --ref-db example_db --output example_dbscan --overwrite --graph-weights", shell=True, check=True)
 
 #refine model with GMM
diff --git a/test/test-gpu.py b/test/test-gpu.py
index 58af75c2..d7e003ca 100755
--- a/test/test-gpu.py
+++ b/test/test-gpu.py
@@ -31,6 +31,10 @@
 
 #fit dbscan
 sys.stderr.write("Running DBSCAN model fit (--fit-model dbscan)\n")
+subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model dbscan --ref-db example_db --output example_dbscan --overwrite --gpu-graph --gpu-model", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model dbscan --ref-db example_db --output example_dbscan --overwrite --gpu-graph --gpu-model --for-refine", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model dbscan --ref-db example_db --output example_dbscan --overwrite --gpu-graph --gpu-model --assign-subsample 1000", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model dbscan --ref-db example_db --output example_dbscan --overwrite --gpu-graph --gpu-model --model-subsample 5000", shell=True, check=True)
 subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model dbscan --ref-db example_db --output example_dbscan --overwrite --graph-weights --gpu-graph", shell=True, check=True)
 
 #refine model with GMM

From 6795025be0c25c72ff99a5731954f5c28169f7b5 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Tue, 27 Feb 2024 11:01:14 +0000
Subject: [PATCH 61/62] Update docs with GPU fitting info

---
 docs/gpu.rst | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/docs/gpu.rst b/docs/gpu.rst
index a1559d2f..2b31215c 100644
--- a/docs/gpu.rst
+++ b/docs/gpu.rst
@@ -111,4 +111,15 @@ Using a GPU
 By default, PopPUNK will use not use GPUs. To use them, you will need to add
 the flag ``--gpu-sketch`` (when constructing or querying a database using reads),
 ``--gpu-dist`` (when constructing or querying a database from assemblies or reads),
-or ``--gpu-graph`` (when querying or visualising a database, or fitting a model).
+``--gpu-model`` (when fitting a DBSCAN model on the GPU), or ``--gpu-graph``
+(when querying or visualising a database, or fitting a model).
+
+Note that fitting a model with a GPU is fast, even with a large subsample of points,
+but may be limited by the memory of the GPU device. Therefore it is recommended that
+either the model is only fitted to a subsample of points, resulting in an incomplete
+model fit that must then be refined before use (option ``--for-refine``), or that
+the transfer of data between CPU and GPU is optimised using the ``--assign-subsample``
+variable. Larger values of ``--assign-subsample`` will result in fewer batches being
+transferred to the GPU memory, speeding up the process, but also increasing the risks
+that your device will run out of memory, particularly if a large, complex model object
+is already being stored on the device.

From 2920b004ed83c2fa808bf80e2408426b5a769698 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Tue, 27 Feb 2024 16:17:25 +0000
Subject: [PATCH 62/62] Bump version

---
 PopPUNK/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PopPUNK/__init__.py b/PopPUNK/__init__.py
index 34bca61d..7fe643a0 100644
--- a/PopPUNK/__init__.py
+++ b/PopPUNK/__init__.py
@@ -3,7 +3,7 @@
 
 '''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''
 
-__version__ = '2.6.3'
+__version__ = '2.6.4'
 
 # Minimum sketchlib version
 SKETCHLIB_MAJOR = 2