From ea9229bcf7dff506fe2843a67f27f646aa9da583 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 17 Jan 2024 14:26:20 +0000 Subject: [PATCH 01/62] Allow changes to model fitting subsample --- PopPUNK/__main__.py | 6 ++++-- PopPUNK/models.py | 4 +--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 2fe144dc..ca64fbec 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -114,6 +114,8 @@ def get_options(): # model fitting modelGroup = parser.add_argument_group('Model fit options') + modelGroup.add_argument('--model-subsample', help='Number of pairwise distances used to fit model [default = 100000]', + type=int, default=100000) modelGroup.add_argument('--K', help='Maximum number of mixture components [default = 2]', type=int, default=2) modelGroup.add_argument('--D', help='Maximum number of clusters in DBSCAN fitting [default = 100]', type=int, default=100) modelGroup.add_argument('--min-cluster-prop', help='Minimum proportion of points in a cluster ' @@ -491,12 +493,12 @@ def main(): if args.fit_model: # Run DBSCAN model if args.fit_model == "dbscan": - model = DBSCANFit(output) + model = DBSCANFit(output, max_samples = args.model_subsample) model.set_threads(args.threads) assignments = model.fit(distMat, args.D, args.min_cluster_prop) # Run Gaussian model elif args.fit_model == "bgmm": - model = BGMMFit(output) + model = BGMMFit(output, max_samples = args.model_subsample) model.set_threads(args.threads) assignments = model.fit(distMat, args.K) elif args.fit_model == "refine": diff --git a/PopPUNK/models.py b/PopPUNK/models.py index d51f83f4..44262f1b 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -289,11 +289,10 @@ class BGMMFit(ClusterFit): The output prefix used for reading/writing max_samples (int) The number of subsamples to fit the model to - (default = 100000) ''' - def __init__(self, outPrefix, max_samples = 50000): + def __init__(self, outPrefix, max_samples = 100000): ClusterFit.__init__(self, outPrefix) self.type = 'bgmm' self.preprocess = True @@ -469,7 +468,6 @@ class DBSCANFit(ClusterFit): The output prefix used for reading/writing max_samples (int) The number of subsamples to fit the model to - (default = 100000) ''' From a4d98af7d13771fcf79c6ef7da7b58aca4778af1 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 17 Jan 2024 15:34:53 +0000 Subject: [PATCH 02/62] Add GPU HDBSCAN algorithm --- PopPUNK/__main__.py | 3 ++- PopPUNK/dbscan.py | 40 ++++++++++++++++++++++++++++++++-------- PopPUNK/models.py | 10 ++++++++-- 3 files changed, 42 insertions(+), 11 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index ca64fbec..75b7884c 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -182,6 +182,7 @@ def get_options(): other.add_argument('--threads', default=1, type=int, help='Number of threads to use [default = 1]') other.add_argument('--gpu-sketch', default=False, action='store_true', help='Use a GPU when calculating sketches (read data only) [default = False]') other.add_argument('--gpu-dist', default=False, action='store_true', help='Use a GPU when calculating distances [default = False]') + other.add_argument('--gpu-model', default=False, action='store_true', help='Use a GPU when fitting a model [default = False]') other.add_argument('--gpu-graph', default=False, action='store_true', help='Use a GPU when calculating networks [default = False]') other.add_argument('--deviceid', default=0, type=int, help='CUDA device ID, if using GPU [default = 0]') other.add_argument('--no-plot', help='Switch off model plotting, which can be slow for large datasets', @@ -495,7 +496,7 @@ def main(): if args.fit_model == "dbscan": model = DBSCANFit(output, max_samples = args.model_subsample) model.set_threads(args.threads) - assignments = model.fit(distMat, args.D, args.min_cluster_prop) + assignments = model.fit(distMat, args.D, args.min_cluster_prop, args.gpu_model) # Run Gaussian model elif args.fit_model == "bgmm": model = BGMMFit(output, max_samples = args.model_subsample) diff --git a/PopPUNK/dbscan.py b/PopPUNK/dbscan.py index 2ab580c1..a85e19e4 100644 --- a/PopPUNK/dbscan.py +++ b/PopPUNK/dbscan.py @@ -9,7 +9,9 @@ # hdbscan import hdbscan -def fitDbScan(X, min_samples, min_cluster_size, cache_out): +from .utils import check_and_set_gpu + +def fitDbScan(X, min_samples, min_cluster_size, cache_out, use_gpu = False): """Function to fit DBSCAN model as an alternative to the Gaussian Fits the DBSCAN model to the distances using hdbscan @@ -23,6 +25,8 @@ def fitDbScan(X, min_samples, min_cluster_size, cache_out): Minimum number of points in a cluster for HDBSCAN cache_out (str) Prefix for DBSCAN cache used for refitting + use_gpu (bool) + Whether GPU algorithms should be used in DBSCAN fitting Returns: hdb (hdbscan.HDBSCAN) @@ -32,14 +36,34 @@ def fitDbScan(X, min_samples, min_cluster_size, cache_out): n_clusters (int) Number of clusters used """ + # Check on initialisation of GPU libraries and memory + try: + import cudf + from cuml import cluster + gpu_lib = True + except ImportError as e: + gpu_lib = False + # check on GPU + use_gpu = check_and_set_gpu(use_gpu, + gpu_lib, + quit_on_fail = True) # set DBSCAN clustering parameters - hdb = hdbscan.HDBSCAN(algorithm='boruvka_balltree', - min_samples = min_samples, - #core_dist_n_jobs = threads, # may cause error, see #19 - memory = cache_out, - prediction_data = True, - min_cluster_size = min_cluster_size - ).fit(X) + if use_gpu: + sys.stderr.write('Fitting HDBSCAN model using a GPU') + hdb = cluster.HDBSCAN(min_samples = min_samples, + #core_dist_n_jobs = threads, # may cause error, see #19 + prediction_data = True, + min_cluster_size = min_cluster_size + ).fit(X) + else: + sys.stderr.write('Fitting HDBSCAN model using a CPU') + hdb = hdbscan.HDBSCAN(algorithm='boruvka_balltree', + min_samples = min_samples, + #core_dist_n_jobs = threads, # may cause error, see #19 + memory = cache_out, + prediction_data = True, + min_cluster_size = min_cluster_size + ).fit(X) # Number of clusters in labels, ignoring noise if present. labels = hdb.labels_ n_clusters = len(set(labels)) - (1 if -1 in labels else 0) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 44262f1b..67114ea7 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -478,7 +478,7 @@ def __init__(self, outPrefix, max_samples = 100000): self.max_samples = max_samples - def fit(self, X, max_num_clusters, min_cluster_prop): + def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): '''Extends :func:`~ClusterFit.fit` Fits the distances with HDBSCAN and returns assignments by calling @@ -494,6 +494,8 @@ def fit(self, X, max_num_clusters, min_cluster_prop): Maximum number of clusters in DBSCAN fitting min_cluster_prop (float) Minimum proportion of points in a cluster in DBSCAN fitting + use_gpu (bool) + Whether GPU algorithms should be used in DBSCAN fitting Returns: y (numpy.array) @@ -508,7 +510,11 @@ def fit(self, X, max_num_clusters, min_cluster_prop): indistinct_clustering = True while indistinct_clustering and min_cluster_size >= min_samples and min_samples >= 10: - self.hdb, self.labels, self.n_clusters = fitDbScan(self.subsampled_X, min_samples, min_cluster_size, cache_out) + self.hdb, self.labels, self.n_clusters = fitDbScan(self.subsampled_X, + min_samples, + min_cluster_size, + cache_out, + use_gpu = use_gpu) self.fitted = True # needed for predict # Test whether model fit contains distinct clusters From d30297410b7a9c2d5121fd35a797933dfcd6f5ae Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 18 Jan 2024 13:42:46 +0000 Subject: [PATCH 03/62] Update GPU memory management --- PopPUNK/dbscan.py | 11 ++++++----- PopPUNK/utils.py | 6 ++++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/PopPUNK/dbscan.py b/PopPUNK/dbscan.py index a85e19e4..2f8b3d81 100644 --- a/PopPUNK/dbscan.py +++ b/PopPUNK/dbscan.py @@ -29,7 +29,7 @@ def fitDbScan(X, min_samples, min_cluster_size, cache_out, use_gpu = False): Whether GPU algorithms should be used in DBSCAN fitting Returns: - hdb (hdbscan.HDBSCAN) + hdb (hdbscan.HDBSCAN or cuml.cluster.HDBSCAN) Fitted HDBSCAN to subsampled data labels (list) Cluster assignments of each sample @@ -40,6 +40,7 @@ def fitDbScan(X, min_samples, min_cluster_size, cache_out, use_gpu = False): try: import cudf from cuml import cluster + import cupy gpu_lib = True except ImportError as e: gpu_lib = False @@ -49,14 +50,14 @@ def fitDbScan(X, min_samples, min_cluster_size, cache_out, use_gpu = False): quit_on_fail = True) # set DBSCAN clustering parameters if use_gpu: - sys.stderr.write('Fitting HDBSCAN model using a GPU') - hdb = cluster.HDBSCAN(min_samples = min_samples, - #core_dist_n_jobs = threads, # may cause error, see #19 + sys.stderr.write('Fitting HDBSCAN model using a GPU\n') + hdb = cluster.hdbscan.HDBSCAN(min_samples = min_samples, + output_type = 'cupy', prediction_data = True, min_cluster_size = min_cluster_size ).fit(X) else: - sys.stderr.write('Fitting HDBSCAN model using a CPU') + sys.stderr.write('Fitting HDBSCAN model using a CPU\n') hdb = hdbscan.HDBSCAN(algorithm='boruvka_balltree', min_samples = min_samples, #core_dist_n_jobs = threads, # may cause error, see #19 diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 647d0203..ebc1948e 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -8,6 +8,7 @@ import sys # additional import pickle +import multiprocessing from collections import defaultdict from itertools import chain from tempfile import mkstemp @@ -572,11 +573,12 @@ def check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = False): # Set memory management for large networks if use_gpu: + multiprocessing.set_start_method('spawn', force=True) rmm.reinitialize(managed_memory=True) if "cupy" in sys.modules: - cupy.cuda.set_allocator(rmm.rmm_cupy_allocator) + cupy.cuda.set_allocator(rmm.allocators.cupy.rmm_cupy_allocator) if "cuda" in sys.modules: - cuda.set_memory_manager(rmm.RMMNumbaManager) + cuda.set_memory_manager(rmm.allocators.numba.RMMNumbaManager) assert(rmm.is_initialized()) return use_gpu From de8b76ff9906056d3a05116220d6c9b0eaa35334 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 18 Jan 2024 13:49:21 +0000 Subject: [PATCH 04/62] Update min samples limit --- PopPUNK/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 67114ea7..09eea472 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -505,7 +505,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): # DBSCAN parameters cache_out = "./" + self.outPrefix + "_cache" - min_samples = max(int(min_cluster_prop * self.subsampled_X.shape[0]), 10) + min_samples = min(max(int(min_cluster_prop * self.subsampled_X.shape[0]), 10),1023) min_cluster_size = max(int(0.01 * self.subsampled_X.shape[0]), 10) indistinct_clustering = True From edce43141b358544affa1f9a21a753ba38f6b2b8 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 18 Jan 2024 14:19:31 +0000 Subject: [PATCH 05/62] Enable GPU model assignments --- PopPUNK/models.py | 120 ++++++++++++++++++++++++++++------------------ 1 file changed, 73 insertions(+), 47 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 09eea472..5efab492 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -39,6 +39,7 @@ import cupy as cp from numba import cuda import rmm + from cuml import cluster except ImportError: pass @@ -519,22 +520,42 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): # Test whether model fit contains distinct clusters if self.n_clusters > 1 and self.n_clusters <= max_num_clusters: - # get within strain cluster - self.max_cluster_num = self.labels.max() - self.cluster_means = np.full((self.n_clusters,2),0.0,dtype=float) - self.cluster_mins = np.full((self.n_clusters,2),0.0,dtype=float) - self.cluster_maxs = np.full((self.n_clusters,2),0.0,dtype=float) - - for i in range(self.max_cluster_num+1): - self.cluster_means[i,] = [np.mean(self.subsampled_X[self.labels==i,0]),np.mean(self.subsampled_X[self.labels==i,1])] - self.cluster_mins[i,] = [np.min(self.subsampled_X[self.labels==i,0]),np.min(self.subsampled_X[self.labels==i,1])] - self.cluster_maxs[i,] = [np.max(self.subsampled_X[self.labels==i,0]),np.max(self.subsampled_X[self.labels==i,1])] - - y = self.assign(self.subsampled_X, no_scale=True, progress=False) - self.within_label = findWithinLabel(self.cluster_means, y) - self.between_label = findBetweenLabel(y, self.within_label) - - indistinct_clustering = evaluate_dbscan_clusters(self) + + if use_gpu: + # get within strain cluster + self.max_cluster_num = self.labels.max() + self.cluster_means = cupy.full((self.n_clusters,2),0.0,dtype=float) + self.cluster_mins = cupy.full((self.n_clusters,2),0.0,dtype=float) + self.cluster_maxs = cupy.full((self.n_clusters,2),0.0,dtype=float) + + for i in range(self.max_cluster_num+1): + self.cluster_means[i,] = [cupy.mean(self.subsampled_X[self.labels==i,0]),cupy.mean(self.subsampled_X[self.labels==i,1])] + self.cluster_mins[i,] = [cupy.min(self.subsampled_X[self.labels==i,0]),cupy.min(self.subsampled_X[self.labels==i,1])] + self.cluster_maxs[i,] = [cupy.max(self.subsampled_X[self.labels==i,0]),cupy.max(self.subsampled_X[self.labels==i,1])] + + y = self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu = True) + self.within_label = findWithinLabel(self.cluster_means, y) + self.between_label = findBetweenLabel(y, self.within_label) + + indistinct_clustering = evaluate_dbscan_clusters(self) + + else: + # get within strain cluster + self.max_cluster_num = self.labels.max() + self.cluster_means = np.full((self.n_clusters,2),0.0,dtype=float) + self.cluster_mins = np.full((self.n_clusters,2),0.0,dtype=float) + self.cluster_maxs = np.full((self.n_clusters,2),0.0,dtype=float) + + for i in range(self.max_cluster_num+1): + self.cluster_means[i,] = [np.mean(self.subsampled_X[self.labels==i,0]),np.mean(self.subsampled_X[self.labels==i,1])] + self.cluster_mins[i,] = [np.min(self.subsampled_X[self.labels==i,0]),np.min(self.subsampled_X[self.labels==i,1])] + self.cluster_maxs[i,] = [np.max(self.subsampled_X[self.labels==i,0]),np.max(self.subsampled_X[self.labels==i,1])] + + y = self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu = False) + self.within_label = findWithinLabel(self.cluster_means, y) + self.between_label = findBetweenLabel(y, self.within_label) + + indistinct_clustering = evaluate_dbscan_clusters(self) # Alter minimum cluster size criterion if min_cluster_size < min_samples / 2: @@ -624,7 +645,7 @@ def plot(self, X=None, y=None): self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_dbscan") - def assign(self, X, no_scale = False, progress = True): + def assign(self, X, no_scale = False, progress = True, use_gpu = False): '''Assign the clustering of new samples using :func:`~PopPUNK.dbscan.assign_samples_dbscan` Args: @@ -632,12 +653,13 @@ def assign(self, X, no_scale = False, progress = True): Core and accessory distances no_scale (bool) Do not scale X - [default = False] progress (bool) Show progress bar - [default = True] + use_gpu (bool) + Use GPU-enabled algorithms for clustering + [default = False] Returns: y (numpy.array) Cluster assignments by samples @@ -652,34 +674,38 @@ def assign(self, X, no_scale = False, progress = True): if progress: sys.stderr.write("Assigning distances with DBSCAN model\n") - y = np.zeros(X.shape[0], dtype=int) - block_size = 5000 - n_blocks = (X.shape[0] - 1) // block_size + 1 - with SharedMemoryManager() as smm: - shm_X = smm.SharedMemory(size = X.nbytes) - X_shared_array = np.ndarray(X.shape, dtype = X.dtype, buffer = shm_X.buf) - X_shared_array[:] = X[:] - X_shared = NumpyShared(name = shm_X.name, shape = X.shape, dtype = X.dtype) - - shm_y = smm.SharedMemory(size = y.nbytes) - y_shared_array = np.ndarray(y.shape, dtype = y.dtype, buffer = shm_y.buf) - y_shared_array[:] = y[:] - y_shared = NumpyShared(name = shm_y.name, shape = y.shape, dtype = y.dtype) - - tqdm.set_lock(RLock()) - process_map(partial(assign_samples, - X = X_shared, - y = y_shared, - model = self, - scale = scale, - chunk_size = block_size, - values = False), - range(n_blocks), - max_workers=self.threads, - chunksize=min(10, max(1, n_blocks // self.threads)), - disable=(progress == False)) - - y[:] = y_shared_array[:] + if use_gpu: + X_assignments = cluster.hdbscan.approximate_predict(self.hdb, X) + y = X_assignments.labels + else: + y = np.zeros(X.shape[0], dtype=int) + block_size = 5000 + n_blocks = (X.shape[0] - 1) // block_size + 1 + with SharedMemoryManager() as smm: + shm_X = smm.SharedMemory(size = X.nbytes) + X_shared_array = np.ndarray(X.shape, dtype = X.dtype, buffer = shm_X.buf) + X_shared_array[:] = X[:] + X_shared = NumpyShared(name = shm_X.name, shape = X.shape, dtype = X.dtype) + + shm_y = smm.SharedMemory(size = y.nbytes) + y_shared_array = np.ndarray(y.shape, dtype = y.dtype, buffer = shm_y.buf) + y_shared_array[:] = y[:] + y_shared = NumpyShared(name = shm_y.name, shape = y.shape, dtype = y.dtype) + + tqdm.set_lock(RLock()) + process_map(partial(assign_samples, + X = X_shared, + y = y_shared, + model = self, + scale = scale, + chunk_size = block_size, + values = False), + range(n_blocks), + max_workers=self.threads, + chunksize=min(10, max(1, n_blocks // self.threads)), + disable=(progress == False)) + + y[:] = y_shared_array[:] return y From ab859c3664d64ccdf49f265edd96518bdee012d4 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 18 Jan 2024 14:29:24 +0000 Subject: [PATCH 06/62] Cluster counting with cupy --- PopPUNK/dbscan.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/PopPUNK/dbscan.py b/PopPUNK/dbscan.py index 2f8b3d81..4c7adeed 100644 --- a/PopPUNK/dbscan.py +++ b/PopPUNK/dbscan.py @@ -40,7 +40,7 @@ def fitDbScan(X, min_samples, min_cluster_size, cache_out, use_gpu = False): try: import cudf from cuml import cluster - import cupy + import cupy as cp gpu_lib = True except ImportError as e: gpu_lib = False @@ -56,6 +56,9 @@ def fitDbScan(X, min_samples, min_cluster_size, cache_out, use_gpu = False): prediction_data = True, min_cluster_size = min_cluster_size ).fit(X) + # Number of clusters in labels, ignoring noise if present. + labels = hdb.labels_ + n_clusters = len(cp.unique(labels[labels>-1])) else: sys.stderr.write('Fitting HDBSCAN model using a CPU\n') hdb = hdbscan.HDBSCAN(algorithm='boruvka_balltree', @@ -65,9 +68,9 @@ def fitDbScan(X, min_samples, min_cluster_size, cache_out, use_gpu = False): prediction_data = True, min_cluster_size = min_cluster_size ).fit(X) - # Number of clusters in labels, ignoring noise if present. - labels = hdb.labels_ - n_clusters = len(set(labels)) - (1 if -1 in labels else 0) + # Number of clusters in labels, ignoring noise if present. + labels = hdb.labels_ + n_clusters = len(set(labels)) - (1 if -1 in labels else 0) # return model parameters return hdb, labels, n_clusters From d314fd8b01217f5f5f3634e8129cff4255efb951 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 18 Jan 2024 14:34:17 +0000 Subject: [PATCH 07/62] Fix cupy abbreviation --- PopPUNK/models.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 5efab492..6757b598 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -524,14 +524,14 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): if use_gpu: # get within strain cluster self.max_cluster_num = self.labels.max() - self.cluster_means = cupy.full((self.n_clusters,2),0.0,dtype=float) - self.cluster_mins = cupy.full((self.n_clusters,2),0.0,dtype=float) - self.cluster_maxs = cupy.full((self.n_clusters,2),0.0,dtype=float) + self.cluster_means = cp.full((self.n_clusters,2),0.0,dtype=float) + self.cluster_mins = cp.full((self.n_clusters,2),0.0,dtype=float) + self.cluster_maxs = cp.full((self.n_clusters,2),0.0,dtype=float) for i in range(self.max_cluster_num+1): - self.cluster_means[i,] = [cupy.mean(self.subsampled_X[self.labels==i,0]),cupy.mean(self.subsampled_X[self.labels==i,1])] - self.cluster_mins[i,] = [cupy.min(self.subsampled_X[self.labels==i,0]),cupy.min(self.subsampled_X[self.labels==i,1])] - self.cluster_maxs[i,] = [cupy.max(self.subsampled_X[self.labels==i,0]),cupy.max(self.subsampled_X[self.labels==i,1])] + self.cluster_means[i,] = [cp.mean(self.subsampled_X[self.labels==i,0]),cp.mean(self.subsampled_X[self.labels==i,1])] + self.cluster_mins[i,] = [cp.min(self.subsampled_X[self.labels==i,0]),cp.min(self.subsampled_X[self.labels==i,1])] + self.cluster_maxs[i,] = [cp.max(self.subsampled_X[self.labels==i,0]),cp.max(self.subsampled_X[self.labels==i,1])] y = self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu = True) self.within_label = findWithinLabel(self.cluster_means, y) From 36856779963e6940fc40af01a83f884c6924161b Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 18 Jan 2024 16:13:39 +0000 Subject: [PATCH 08/62] Get max from cupy array --- PopPUNK/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 6757b598..98a9fbd5 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -523,7 +523,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): if use_gpu: # get within strain cluster - self.max_cluster_num = self.labels.max() + self.max_cluster_num = int(self.labels.amax()[0]) self.cluster_means = cp.full((self.n_clusters,2),0.0,dtype=float) self.cluster_mins = cp.full((self.n_clusters,2),0.0,dtype=float) self.cluster_maxs = cp.full((self.n_clusters,2),0.0,dtype=float) From 6c031eb0c802875d9686e0b7f6b50752ff8a3e14 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 18 Jan 2024 16:18:05 +0000 Subject: [PATCH 09/62] Change max function --- PopPUNK/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 98a9fbd5..b7b91db1 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -523,7 +523,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): if use_gpu: # get within strain cluster - self.max_cluster_num = int(self.labels.amax()[0]) + self.max_cluster_num = int(self.labels.max()[0]) self.cluster_means = cp.full((self.n_clusters,2),0.0,dtype=float) self.cluster_mins = cp.full((self.n_clusters,2),0.0,dtype=float) self.cluster_maxs = cp.full((self.n_clusters,2),0.0,dtype=float) From 7b7d30b19b484ac6a34fca049c02868b23b93aa6 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 18 Jan 2024 16:22:52 +0000 Subject: [PATCH 10/62] Change cast to int --- PopPUNK/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index b7b91db1..4f0b2c52 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -523,7 +523,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): if use_gpu: # get within strain cluster - self.max_cluster_num = int(self.labels.max()[0]) + self.max_cluster_num = int(self.labels.max()) self.cluster_means = cp.full((self.n_clusters,2),0.0,dtype=float) self.cluster_mins = cp.full((self.n_clusters,2),0.0,dtype=float) self.cluster_maxs = cp.full((self.n_clusters,2),0.0,dtype=float) From aa8d68669df474027e780c0ae35ac2212e540b8e Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 18 Jan 2024 16:35:34 +0000 Subject: [PATCH 11/62] Convert data to cupy --- PopPUNK/models.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 4f0b2c52..ae86ff8a 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -509,8 +509,13 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): min_samples = min(max(int(min_cluster_prop * self.subsampled_X.shape[0]), 10),1023) min_cluster_size = max(int(0.01 * self.subsampled_X.shape[0]), 10) + # Convert to cupy if using GPU to avoid implicit numpy conversion below + if use_gpu: + self.subsampled_X = cp.asarray(self.subsampled_X.shape) + indistinct_clustering = True while indistinct_clustering and min_cluster_size >= min_samples and min_samples >= 10: + # Fit model self.hdb, self.labels, self.n_clusters = fitDbScan(self.subsampled_X, min_samples, min_cluster_size, From 54b5ab07862651c590a8f9b4cab1d96fbb086a97 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 18 Jan 2024 20:33:00 +0000 Subject: [PATCH 12/62] Relax constraint on DBSCAN fits --- PopPUNK/dbscan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/dbscan.py b/PopPUNK/dbscan.py index 4c7adeed..5cd582e2 100644 --- a/PopPUNK/dbscan.py +++ b/PopPUNK/dbscan.py @@ -98,7 +98,7 @@ def evaluate_dbscan_clusters(model): # evaluate whether maxima of cluster nearest origin do not # overlap with minima of cluster furthest from origin - if core_minimum_of_between > core_maximum_of_within and \ + if core_minimum_of_between > core_maximum_of_within or \ accessory_minimum_of_between > accessory_maximum_of_within: indistinct = False From 78336b7f5479d454691d702343b5e4aed0c12471 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 18 Jan 2024 20:33:10 +0000 Subject: [PATCH 13/62] Simplify fitting code --- PopPUNK/models.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index ae86ff8a..e7dadc83 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -539,10 +539,6 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): self.cluster_maxs[i,] = [cp.max(self.subsampled_X[self.labels==i,0]),cp.max(self.subsampled_X[self.labels==i,1])] y = self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu = True) - self.within_label = findWithinLabel(self.cluster_means, y) - self.between_label = findBetweenLabel(y, self.within_label) - - indistinct_clustering = evaluate_dbscan_clusters(self) else: # get within strain cluster @@ -557,10 +553,11 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): self.cluster_maxs[i,] = [np.max(self.subsampled_X[self.labels==i,0]),np.max(self.subsampled_X[self.labels==i,1])] y = self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu = False) - self.within_label = findWithinLabel(self.cluster_means, y) - self.between_label = findBetweenLabel(y, self.within_label) - - indistinct_clustering = evaluate_dbscan_clusters(self) + + # Evaluate clustering + self.within_label = findWithinLabel(self.cluster_means, y) + self.between_label = findBetweenLabel(y, self.within_label) + indistinct_clustering = evaluate_dbscan_clusters(self) # Alter minimum cluster size criterion if min_cluster_size < min_samples / 2: From 26e951103e617f4031e6e08ddbfda9482e789634 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 19 Jan 2024 10:26:43 +0000 Subject: [PATCH 14/62] Manage GPU memory --- PopPUNK/dbscan.py | 14 ++------------ PopPUNK/models.py | 22 ++++++++++++++++++---- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/PopPUNK/dbscan.py b/PopPUNK/dbscan.py index 5cd582e2..4b073247 100644 --- a/PopPUNK/dbscan.py +++ b/PopPUNK/dbscan.py @@ -36,20 +36,10 @@ def fitDbScan(X, min_samples, min_cluster_size, cache_out, use_gpu = False): n_clusters (int) Number of clusters used """ - # Check on initialisation of GPU libraries and memory - try: - import cudf - from cuml import cluster - import cupy as cp - gpu_lib = True - except ImportError as e: - gpu_lib = False - # check on GPU - use_gpu = check_and_set_gpu(use_gpu, - gpu_lib, - quit_on_fail = True) # set DBSCAN clustering parameters if use_gpu: + from cuml import cluster + import cupy as cp sys.stderr.write('Fitting HDBSCAN model using a GPU\n') hdb = cluster.hdbscan.HDBSCAN(min_samples = min_samples, output_type = 'cupy', diff --git a/PopPUNK/models.py b/PopPUNK/models.py index e7dadc83..c5a5edb3 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -509,9 +509,22 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): min_samples = min(max(int(min_cluster_prop * self.subsampled_X.shape[0]), 10),1023) min_cluster_size = max(int(0.01 * self.subsampled_X.shape[0]), 10) + # Check on initialisation of GPU libraries and memory # Convert to cupy if using GPU to avoid implicit numpy conversion below if use_gpu: - self.subsampled_X = cp.asarray(self.subsampled_X.shape) + try: + import cudf + from cuml import cluster + import cupy as cp + gpu_lib = True + except ImportError as e: + gpu_lib = False + # check on GPU + use_gpu = check_and_set_gpu(use_gpu, + gpu_lib, + quit_on_fail = True) + if use_gpu: + self.subsampled_X = cp.asarray(self.subsampled_X) indistinct_clustering = True while indistinct_clustering and min_cluster_size >= min_samples and min_samples >= 10: @@ -534,9 +547,10 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): self.cluster_maxs = cp.full((self.n_clusters,2),0.0,dtype=float) for i in range(self.max_cluster_num+1): - self.cluster_means[i,] = [cp.mean(self.subsampled_X[self.labels==i,0]),cp.mean(self.subsampled_X[self.labels==i,1])] - self.cluster_mins[i,] = [cp.min(self.subsampled_X[self.labels==i,0]),cp.min(self.subsampled_X[self.labels==i,1])] - self.cluster_maxs[i,] = [cp.max(self.subsampled_X[self.labels==i,0]),cp.max(self.subsampled_X[self.labels==i,1])] + labelled_rows = cp.where(self.labels==i,True,False) + self.cluster_means[i,] = [cp.mean(self.subsampled_X[labelled_rows,0]),cp.mean(self.subsampled_X[labelled_rows,1])] + self.cluster_mins[i,] = [cp.min(self.subsampled_X[labelled_rows,0]),cp.min(self.subsampled_X[labelled_rows,1])] + self.cluster_maxs[i,] = [cp.max(self.subsampled_X[labelled_rows,0]),cp.max(self.subsampled_X[labelled_rows,1])] y = self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu = True) From c5ab3d8ecafdb3002c945a60e9366298d81c8dc0 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 19 Jan 2024 10:39:30 +0000 Subject: [PATCH 15/62] Cupy indexing fix --- PopPUNK/models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index c5a5edb3..009a0e5d 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -548,9 +548,9 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): for i in range(self.max_cluster_num+1): labelled_rows = cp.where(self.labels==i,True,False) - self.cluster_means[i,] = [cp.mean(self.subsampled_X[labelled_rows,0]),cp.mean(self.subsampled_X[labelled_rows,1])] - self.cluster_mins[i,] = [cp.min(self.subsampled_X[labelled_rows,0]),cp.min(self.subsampled_X[labelled_rows,1])] - self.cluster_maxs[i,] = [cp.max(self.subsampled_X[labelled_rows,0]),cp.max(self.subsampled_X[labelled_rows,1])] + self.cluster_means[i,] = [cp.mean(self.subsampled_X[labelled_rows,cp.array([0])]),cp.mean(self.subsampled_X[labelled_rows,cp.array([1])])] + self.cluster_mins[i,] = [cp.min(self.subsampled_X[labelled_rows,cp.array([0])]),cp.min(self.subsampled_X[labelled_rows,cp.array([1])])] + self.cluster_maxs[i,] = [cp.max(self.subsampled_X[labelled_rows,cp.array([0])]),cp.max(self.subsampled_X[labelled_rows,cp.array([1])])] y = self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu = True) From 50ac74515eff005163d6d73c01d04dbb27598f1f Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 19 Jan 2024 10:44:37 +0000 Subject: [PATCH 16/62] Cupy indexing fix again --- PopPUNK/models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 009a0e5d..886c1e9c 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -548,9 +548,9 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): for i in range(self.max_cluster_num+1): labelled_rows = cp.where(self.labels==i,True,False) - self.cluster_means[i,] = [cp.mean(self.subsampled_X[labelled_rows,cp.array([0])]),cp.mean(self.subsampled_X[labelled_rows,cp.array([1])])] - self.cluster_mins[i,] = [cp.min(self.subsampled_X[labelled_rows,cp.array([0])]),cp.min(self.subsampled_X[labelled_rows,cp.array([1])])] - self.cluster_maxs[i,] = [cp.max(self.subsampled_X[labelled_rows,cp.array([0])]),cp.max(self.subsampled_X[labelled_rows,cp.array([1])])] + self.cluster_means[cp.array(i),] = [cp.mean(self.subsampled_X[labelled_rows,cp.array([0])]),cp.mean(self.subsampled_X[labelled_rows,cp.array([1])])] + self.cluster_mins[cp.array(i),] = [cp.min(self.subsampled_X[labelled_rows,cp.array([0])]),cp.min(self.subsampled_X[labelled_rows,cp.array([1])])] + self.cluster_maxs[cp.array(i),] = [cp.max(self.subsampled_X[labelled_rows,cp.array([0])]),cp.max(self.subsampled_X[labelled_rows,cp.array([1])])] y = self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu = True) From 507cfa3f35310e84819ae71cf7edb7af95c6f74a Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 19 Jan 2024 10:55:53 +0000 Subject: [PATCH 17/62] Fix prediction parsing --- PopPUNK/models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 886c1e9c..50eedd95 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -691,8 +691,7 @@ def assign(self, X, no_scale = False, progress = True, use_gpu = False): sys.stderr.write("Assigning distances with DBSCAN model\n") if use_gpu: - X_assignments = cluster.hdbscan.approximate_predict(self.hdb, X) - y = X_assignments.labels + y, y_probabilities = cluster.hdbscan.approximate_predict(self.hdb, X) else: y = np.zeros(X.shape[0], dtype=int) block_size = 5000 From da70f27814fbcd79a6094c0d38dc51349aefcf0d Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 19 Jan 2024 11:01:07 +0000 Subject: [PATCH 18/62] Ignore CPU only cache --- PopPUNK/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 50eedd95..d4ffad66 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -583,7 +583,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): self.fitted = False sys.stderr.write("Failed to find distinct clusters in this dataset\n") sys.exit(1) - else: + else if not use_gpu: shutil.rmtree(cache_out) y = self.assign(X) From 5651d2f7b8ed60ba556247b50a850351f6fdddbc Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 19 Jan 2024 11:01:50 +0000 Subject: [PATCH 19/62] Use python --- PopPUNK/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index d4ffad66..707f12d7 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -583,7 +583,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): self.fitted = False sys.stderr.write("Failed to find distinct clusters in this dataset\n") sys.exit(1) - else if not use_gpu: + elif not use_gpu: shutil.rmtree(cache_out) y = self.assign(X) From b5abfdc26e91d04bc4c15cd1945276c8b2814e05 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 19 Jan 2024 11:17:58 +0000 Subject: [PATCH 20/62] Assign by GPU in blocks --- PopPUNK/models.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 707f12d7..d8f9655f 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -691,7 +691,15 @@ def assign(self, X, no_scale = False, progress = True, use_gpu = False): sys.stderr.write("Assigning distances with DBSCAN model\n") if use_gpu: - y, y_probabilities = cluster.hdbscan.approximate_predict(self.hdb, X) + y = np.zeros(X.shape[0], dtype=int) + block_size = 500000 + n_blocks = (X.shape[0] - 1) // block_size + 1 + for block in range(n_blocks): + start_index = block*block_size + end_index = max((block+1)*block_size,X.shape[0]) + sys.stderr.write("Processing rows " + str(start_index) + " to " + str(end_index) + "\n") + y_assignments, y_probabilities = cluster.hdbscan.approximate_predict(self.hdb, X[start_index:end_index,]) + y[start_index:end_index] = cp.asnumpy(y_assignments) else: y = np.zeros(X.shape[0], dtype=int) block_size = 5000 From aafaa1cbeb7d8bac6fefb6ce1b9d2f9786cf02df Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 19 Jan 2024 11:24:48 +0000 Subject: [PATCH 21/62] Specify GPU assignment --- PopPUNK/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index d8f9655f..6e47218c 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -586,7 +586,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): elif not use_gpu: shutil.rmtree(cache_out) - y = self.assign(X) + y = self.assign(X, use_gpu = use_gpu) return y From e6cbf901c7fa8c8e82fec2e3e9c89d79521e5504 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 19 Jan 2024 11:30:52 +0000 Subject: [PATCH 22/62] Improve indices --- PopPUNK/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 6e47218c..aa05b325 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -696,7 +696,7 @@ def assign(self, X, no_scale = False, progress = True, use_gpu = False): n_blocks = (X.shape[0] - 1) // block_size + 1 for block in range(n_blocks): start_index = block*block_size - end_index = max((block+1)*block_size,X.shape[0]) + end_index = min((block+1)*block_size-1,X.shape[0]) sys.stderr.write("Processing rows " + str(start_index) + " to " + str(end_index) + "\n") y_assignments, y_probabilities = cluster.hdbscan.approximate_predict(self.hdb, X[start_index:end_index,]) y[start_index:end_index] = cp.asnumpy(y_assignments) From 539d939bed1aa3d3cd72d38499bd9b54f1c835ad Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 19 Jan 2024 11:37:55 +0000 Subject: [PATCH 23/62] Allow flexible assignment batch size --- PopPUNK/models.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index aa05b325..5849eb4e 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -586,7 +586,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): elif not use_gpu: shutil.rmtree(cache_out) - y = self.assign(X, use_gpu = use_gpu) + y = self.assign(X, max_batch_size = max_match_size = self.subsampled_X.shape[0], use_gpu = use_gpu) return y @@ -661,7 +661,7 @@ def plot(self, X=None, y=None): self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_dbscan") - def assign(self, X, no_scale = False, progress = True, use_gpu = False): + def assign(self, X, no_scale = False, progress = True, max_batch_size = 5000, use_gpu = False): '''Assign the clustering of new samples using :func:`~PopPUNK.dbscan.assign_samples_dbscan` Args: @@ -673,6 +673,9 @@ def assign(self, X, no_scale = False, progress = True, use_gpu = False): progress (bool) Show progress bar [default = True] + max_batch_size (int) + Batch size used for GPU assignments + [default = 5000] use_gpu (bool) Use GPU-enabled algorithms for clustering [default = False] @@ -692,7 +695,7 @@ def assign(self, X, no_scale = False, progress = True, use_gpu = False): if use_gpu: y = np.zeros(X.shape[0], dtype=int) - block_size = 500000 + block_size = max_batch_size n_blocks = (X.shape[0] - 1) // block_size + 1 for block in range(n_blocks): start_index = block*block_size From 176b3230f58731cde3ed31984d06abc71dfec28a Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 19 Jan 2024 11:39:17 +0000 Subject: [PATCH 24/62] Argument typo --- PopPUNK/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 5849eb4e..b68e6679 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -586,7 +586,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): elif not use_gpu: shutil.rmtree(cache_out) - y = self.assign(X, max_batch_size = max_match_size = self.subsampled_X.shape[0], use_gpu = use_gpu) + y = self.assign(X, max_match_size = self.subsampled_X.shape[0], use_gpu = use_gpu) return y From 8ebbb8936913ae2235690df665b13c95aee0a7a4 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 19 Jan 2024 12:06:51 +0000 Subject: [PATCH 25/62] Fix assignment command --- PopPUNK/models.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index b68e6679..498b8111 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -552,7 +552,11 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): self.cluster_mins[cp.array(i),] = [cp.min(self.subsampled_X[labelled_rows,cp.array([0])]),cp.min(self.subsampled_X[labelled_rows,cp.array([1])])] self.cluster_maxs[cp.array(i),] = [cp.max(self.subsampled_X[labelled_rows,cp.array([0])]),cp.max(self.subsampled_X[labelled_rows,cp.array([1])])] - y = self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu = True) + y = self.assign(self.subsampled_X, + no_scale=True, + progress=False, + max_batch_size = self.subsampled_X.shape[0], + use_gpu = True) else: # get within strain cluster @@ -586,7 +590,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): elif not use_gpu: shutil.rmtree(cache_out) - y = self.assign(X, max_match_size = self.subsampled_X.shape[0], use_gpu = use_gpu) + y = self.assign(X, max_batch_size = self.subsampled_X.shape[0], use_gpu = use_gpu) return y From 8511ae8241cc9cb2bfe62cafaff0856a77131287 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Sun, 21 Jan 2024 13:08:24 +0000 Subject: [PATCH 26/62] Separate fitting and assigning batch size --- PopPUNK/__main__.py | 4 +++- PopPUNK/models.py | 9 +++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 75b7884c..f53d6635 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -116,6 +116,8 @@ def get_options(): modelGroup = parser.add_argument_group('Model fit options') modelGroup.add_argument('--model-subsample', help='Number of pairwise distances used to fit model [default = 100000]', type=int, default=100000) + modelGroup.add_argument('--assign-subsample', help='Number of pairwise distances in each assignment batch [default = 5000]', + type=int, default=5000) modelGroup.add_argument('--K', help='Maximum number of mixture components [default = 2]', type=int, default=2) modelGroup.add_argument('--D', help='Maximum number of clusters in DBSCAN fitting [default = 100]', type=int, default=100) modelGroup.add_argument('--min-cluster-prop', help='Minimum proportion of points in a cluster ' @@ -494,7 +496,7 @@ def main(): if args.fit_model: # Run DBSCAN model if args.fit_model == "dbscan": - model = DBSCANFit(output, max_samples = args.model_subsample) + model = DBSCANFit(output, max_samples = args.model_subsample, max_batch_size = args.assign_subsample) model.set_threads(args.threads) assignments = model.fit(distMat, args.D, args.min_cluster_prop, args.gpu_model) # Run Gaussian model diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 498b8111..1a0a3ca8 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -678,7 +678,7 @@ def assign(self, X, no_scale = False, progress = True, max_batch_size = 5000, us Show progress bar [default = True] max_batch_size (int) - Batch size used for GPU assignments + Batch size used for assignments [default = 5000] use_gpu (bool) Use GPU-enabled algorithms for clustering @@ -696,10 +696,12 @@ def assign(self, X, no_scale = False, progress = True, max_batch_size = 5000, us scale = self.scale if progress: sys.stderr.write("Assigning distances with DBSCAN model\n") - + + # Set block size + block_size = max_batch_size + if use_gpu: y = np.zeros(X.shape[0], dtype=int) - block_size = max_batch_size n_blocks = (X.shape[0] - 1) // block_size + 1 for block in range(n_blocks): start_index = block*block_size @@ -709,7 +711,6 @@ def assign(self, X, no_scale = False, progress = True, max_batch_size = 5000, us y[start_index:end_index] = cp.asnumpy(y_assignments) else: y = np.zeros(X.shape[0], dtype=int) - block_size = 5000 n_blocks = (X.shape[0] - 1) // block_size + 1 with SharedMemoryManager() as smm: shm_X = smm.SharedMemory(size = X.nbytes) From 03aef56019d150738bc4561ad3cfc9e9ace380e5 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Sun, 21 Jan 2024 13:18:55 +0000 Subject: [PATCH 27/62] Parse batch size --- PopPUNK/models.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 1a0a3ca8..64826282 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -472,10 +472,11 @@ class DBSCANFit(ClusterFit): (default = 100000) ''' - def __init__(self, outPrefix, max_samples = 100000): + def __init__(self, outPrefix, max_batch_size = 5000, max_samples = 100000): ClusterFit.__init__(self, outPrefix) self.type = 'dbscan' self.preprocess = True + self.max_batch_size = max_batch_size self.max_samples = max_samples @@ -590,7 +591,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): elif not use_gpu: shutil.rmtree(cache_out) - y = self.assign(X, max_batch_size = self.subsampled_X.shape[0], use_gpu = use_gpu) + y = self.assign(X, max_batch_size = self.max_batch_size, use_gpu = use_gpu) return y From fadc54da30869292ff6327caf71c49de018ce94b Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Sun, 21 Jan 2024 17:30:13 +0000 Subject: [PATCH 28/62] Use cupy in plot function --- PopPUNK/models.py | 7 ++++--- PopPUNK/plot.py | 9 +++++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 64826282..9c8fde61 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -472,13 +472,13 @@ class DBSCANFit(ClusterFit): (default = 100000) ''' - def __init__(self, outPrefix, max_batch_size = 5000, max_samples = 100000): + def __init__(self, outPrefix, use_gpu = False, max_batch_size = 5000, max_samples = 100000): ClusterFit.__init__(self, outPrefix) self.type = 'dbscan' self.preprocess = True self.max_batch_size = max_batch_size self.max_samples = max_samples - + self.use_gpu = use_gpu def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): '''Extends :func:`~ClusterFit.fit` @@ -663,7 +663,8 @@ def plot(self, X=None, y=None): plot_dbscan_results(self.subsampled_X * self.scale, self.assign(self.subsampled_X, no_scale=True, progress=False), self.n_clusters, - self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_dbscan") + self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_dbscan", + self.use_gpu) def assign(self, X, no_scale = False, progress = True, max_batch_size = 5000, use_gpu = False): diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index faf5ac37..22e639b3 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -182,7 +182,7 @@ def plot_results(X, Y, means, covariances, scale, title, out_prefix): plt.savefig(out_prefix + ".png") plt.close() -def plot_dbscan_results(X, y, n_clusters, out_prefix): +def plot_dbscan_results(X, y, n_clusters, out_prefix, use_gpu): """Draw a scatter plot (png) to show the DBSCAN model fit A scatter plot of core and accessory distances, coloured by component @@ -197,10 +197,15 @@ def plot_dbscan_results(X, y, n_clusters, out_prefix): Number of clusters used (excluding noise) out_prefix (str) Prefix for output file (.png will be appended) + use_gpu (bool) + Whether model was fitted with GPU or not """ # Black removed and is used for noise instead. unique_labels = set(y) - colours = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] # changed to work with two clusters + if use_gpu: + colours = [plt.cm.Spectral(each) for each in cp.linspace(0, 1, len(unique_labels))] # changed to work with two clusters + else: + colours = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] # changed to work with two clusters fig=plt.figure(figsize=(11, 8), dpi= 160, facecolor='w', edgecolor='k') for k in unique_labels: From bceaa691977930d46b578011b94306a592227d95 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Sun, 21 Jan 2024 19:38:15 +0000 Subject: [PATCH 29/62] Convert to numpy before plotting --- PopPUNK/models.py | 13 ++++++++++--- PopPUNK/plot.py | 9 ++------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 9c8fde61..82cb2338 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -478,7 +478,7 @@ def __init__(self, outPrefix, use_gpu = False, max_batch_size = 5000, max_sample self.preprocess = True self.max_batch_size = max_batch_size self.max_samples = max_samples - self.use_gpu = use_gpu + self.use_gpu = use_gpu # Updated below def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): '''Extends :func:`~ClusterFit.fit` @@ -525,7 +525,10 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): gpu_lib, quit_on_fail = True) if use_gpu: + self.use_gpu = True self.subsampled_X = cp.asarray(self.subsampled_X) + else + self.use_gpu = False indistinct_clustering = True while indistinct_clustering and min_cluster_size >= min_samples and min_samples >= 10: @@ -660,11 +663,15 @@ def plot(self, X=None, y=None): sys.stderr.write("\t" + str(centre) + "\n") sys.stderr.write("\n") + # Convert to numpy for plotting + if self.use_gpu: + import cupy as cp + self.subsampled_X = cp.asnumpy(self.scale) + plot_dbscan_results(self.subsampled_X * self.scale, self.assign(self.subsampled_X, no_scale=True, progress=False), self.n_clusters, - self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_dbscan", - self.use_gpu) + self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_dbscan") def assign(self, X, no_scale = False, progress = True, max_batch_size = 5000, use_gpu = False): diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index 22e639b3..faf5ac37 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -182,7 +182,7 @@ def plot_results(X, Y, means, covariances, scale, title, out_prefix): plt.savefig(out_prefix + ".png") plt.close() -def plot_dbscan_results(X, y, n_clusters, out_prefix, use_gpu): +def plot_dbscan_results(X, y, n_clusters, out_prefix): """Draw a scatter plot (png) to show the DBSCAN model fit A scatter plot of core and accessory distances, coloured by component @@ -197,15 +197,10 @@ def plot_dbscan_results(X, y, n_clusters, out_prefix, use_gpu): Number of clusters used (excluding noise) out_prefix (str) Prefix for output file (.png will be appended) - use_gpu (bool) - Whether model was fitted with GPU or not """ # Black removed and is used for noise instead. unique_labels = set(y) - if use_gpu: - colours = [plt.cm.Spectral(each) for each in cp.linspace(0, 1, len(unique_labels))] # changed to work with two clusters - else: - colours = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] # changed to work with two clusters + colours = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] # changed to work with two clusters fig=plt.figure(figsize=(11, 8), dpi= 160, facecolor='w', edgecolor='k') for k in unique_labels: From a19940364f05d84c8ba00cd8d2c4132477759e98 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Sun, 21 Jan 2024 19:39:12 +0000 Subject: [PATCH 30/62] Add colon --- PopPUNK/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 82cb2338..2aa4df9a 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -527,7 +527,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): if use_gpu: self.use_gpu = True self.subsampled_X = cp.asarray(self.subsampled_X) - else + else: self.use_gpu = False indistinct_clustering = True From 20c7bb6dc2a173d5d75c22c37cdf78a299fcc40d Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Sun, 21 Jan 2024 19:47:41 +0000 Subject: [PATCH 31/62] Specify add GPU --- PopPUNK/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 2aa4df9a..02220492 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -669,7 +669,7 @@ def plot(self, X=None, y=None): self.subsampled_X = cp.asnumpy(self.scale) plot_dbscan_results(self.subsampled_X * self.scale, - self.assign(self.subsampled_X, no_scale=True, progress=False), + self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu=self.use_gpu), self.n_clusters, self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_dbscan") From 0f04894b5b820e97a0da1748a74ce46f042dc66c Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Mon, 22 Jan 2024 05:15:37 +0000 Subject: [PATCH 32/62] Move cupy conversion --- PopPUNK/models.py | 8 ++------ PopPUNK/plot.py | 10 +++++++++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 02220492..87f68fa7 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -663,15 +663,11 @@ def plot(self, X=None, y=None): sys.stderr.write("\t" + str(centre) + "\n") sys.stderr.write("\n") - # Convert to numpy for plotting - if self.use_gpu: - import cupy as cp - self.subsampled_X = cp.asnumpy(self.scale) - plot_dbscan_results(self.subsampled_X * self.scale, self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu=self.use_gpu), self.n_clusters, - self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_dbscan") + self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_dbscan", + self.use_gpu) def assign(self, X, no_scale = False, progress = True, max_batch_size = 5000, use_gpu = False): diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index faf5ac37..a0f5d226 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -182,7 +182,7 @@ def plot_results(X, Y, means, covariances, scale, title, out_prefix): plt.savefig(out_prefix + ".png") plt.close() -def plot_dbscan_results(X, y, n_clusters, out_prefix): +def plot_dbscan_results(X, y, n_clusters, out_prefix, use_gpu): """Draw a scatter plot (png) to show the DBSCAN model fit A scatter plot of core and accessory distances, coloured by component @@ -197,7 +197,15 @@ def plot_dbscan_results(X, y, n_clusters, out_prefix): Number of clusters used (excluding noise) out_prefix (str) Prefix for output file (.png will be appended) + use_gpu (bool) + Whether model was fitted with GPU-enabled code """ + # Convert data if from GPU + if use_gpu: + # Convert to numpy for plotting + import cupy as cp + X = cp.asnumpy(X) + # Black removed and is used for noise instead. unique_labels = set(y) colours = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] # changed to work with two clusters From bf5d65714304e93edf951b901817baa87525909b Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Mon, 22 Jan 2024 05:25:35 +0000 Subject: [PATCH 33/62] Add cupy scaling --- PopPUNK/models.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 87f68fa7..a177d909 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -663,6 +663,11 @@ def plot(self, X=None, y=None): sys.stderr.write("\t" + str(centre) + "\n") sys.stderr.write("\n") + # Harmonise scales + if self.use_gpu: + import cupy as cp + self.scale = cp.asarray(self.scale) + plot_dbscan_results(self.subsampled_X * self.scale, self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu=self.use_gpu), self.n_clusters, From 3fe3529253c870478724566309371270eb34cc3d Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Tue, 23 Jan 2024 16:29:29 +0000 Subject: [PATCH 34/62] Update use of cugraph functions --- PopPUNK/network.py | 55 +++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 32 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 244e9e89..b6da9a1b 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -225,7 +225,7 @@ def translate_network_indices(G_ref_df, reference_indices): # Translate network indices to match name order G_ref_df['source'] = [reference_indices.index(x) for x in G_ref_df['old_source'].to_arrow().to_pylist()] G_ref_df['destination'] = [reference_indices.index(x) for x in G_ref_df['old_destination'].to_arrow().to_pylist()] - G_ref = add_self_loop(G_ref_df, len(reference_indices) - 1, renumber = True) + G_ref = generate_cugraph(G_ref_df, len(reference_indices) - 1, renumber = True) return(G_ref) def extractReferences(G, dbOrder, outPrefix, outSuffix = '', type_isolate = None, @@ -881,7 +881,7 @@ def construct_network_from_df(rlist, use_weights = False if weights: use_weights = True - G = add_self_loop(G_df, max_in_vertex_labels, weights = use_weights, renumber = False) + G = generate_cugraph(G_df, max_in_vertex_labels, weights = use_weights, renumber = False) else: # Convert bool to list of weights or None if weights: @@ -1009,7 +1009,7 @@ def construct_dense_weighted_network(rlist, distMat, weights_type = None, use_gp G_df['destination'] = [edge_list[0][1]] G_df['weights'] = weights max_in_vertex_labels = len(vertex_labels)-1 - G = add_self_loop(G_df, max_in_vertex_labels, weights = True, renumber = False) + G = generate_cugraph(G_df, max_in_vertex_labels, weights = True, renumber = False) else: # Construct network with CPU via edge list weighted_edges = [] @@ -1125,8 +1125,9 @@ def get_cugraph_triangles(G): """ nlen = G.number_of_vertices() df = G.view_edge_list() + df = df.drop(df[df.source == df.destination].index) A = cp.full((nlen, nlen), 0, dtype = cp.int32) - A[df.src.values, df.dst.values] = 1 + A[df.source.values, df.destination.values] = 1 A = cp.maximum( A, A.transpose() ) triangle_count = int(cp.around(cp.trace(cp.matmul(A, cp.matmul(A, A)))/6,0)) return triangle_count @@ -1155,11 +1156,9 @@ def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sa component_nums = component_assignments['labels'].unique().astype(int) components = len(component_nums) density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) - # consistent with graph-tool for small graphs - triangle counts differ for large graphs - # could reflect issue https://github.com/rapidsai/cugraph/issues/1043 - # this command can be restored once the above issue is fixed - scheduled for cugraph 0.20 -# triangle_count = cugraph.community.triangle_count.triangles(G)/3 - triangle_count = 3*get_cugraph_triangles(G) + # need to check consistent with graph-tool + triangle_count = cugraph.triangle_count(G)/3 +# triangle_count = 3*get_cugraph_triangles(G) degree_df = G.in_degree() # consistent with graph-tool triad_count = 0.5 * sum([d * (d - 1) for d in degree_df[degree_df['degree'] > 1]['degree'].to_pandas()]) @@ -1353,8 +1352,8 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, return G, qqDistMat -def add_self_loop(G_df, seq_num, weights = False, renumber = True): - """Adds self-loop to cugraph graph to ensure all nodes are included in +def generate_cugraph(G_df, seq_num, weights = False, renumber = True): + """Builds cugraph graph to ensure all nodes are included in the graph, even if singletons. Args: @@ -1370,28 +1369,20 @@ def add_self_loop(G_df, seq_num, weights = False, renumber = True): Dictionary of cluster assignments (keys are sequence names) """ # use self-loop to ensure all nodes are present - min_in_df = np.amin([G_df['source'].min(), G_df['destination'].min()]) - if min_in_df.item() > 0: - G_self_loop = cudf.DataFrame() - G_self_loop['source'] = [0] - G_self_loop['destination'] = [0] - if weights: - G_self_loop['weights'] = 0.0 - G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) - max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) - if max_in_df.item() != seq_num: - G_self_loop = cudf.DataFrame() - G_self_loop['source'] = [seq_num] - G_self_loop['destination'] = [seq_num] - if weights: - G_self_loop['weights'] = 0.0 - G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) + node_indices = cudf.Series(range(seq_num), dtype = cp.int32) + G_self_loop = cudf.DataFrame() + G_self_loop['source'] = node_indices + G_self_loop['destination'] = node_indices + G_self_loop['weights'] = 0.0 + # Build cudf + # Weights are needed to extract subgraphs + # see https://github.com/rapidsai/cugraph/blob/77d833ad/python/cugraph/cugraph/community/subgraph_extraction.py + if not weights: + G_df['weights'] = 1.0 + G_df = cudf.concat([G_self_loop,G_df], ignore_index = True) # Construct graph G_new = cugraph.Graph() - if weights: - G_new.from_cudf_edgelist(G_df, edge_attr = 'weights', renumber = renumber) - else: - G_new.from_cudf_edgelist(G_df, renumber = renumber) + G_new.from_cudf_edgelist(G_df, edge_attr = 'weights', renumber = renumber) return G_new @@ -1847,7 +1838,7 @@ def sparse_mat_to_network(sparse_mat, rlist, use_gpu = False): G_df['destination'] = sparse_mat.col G_df['weights'] = sparse_mat.data max_in_vertex_labels = len(rlist)-1 - G = add_self_loop(G_df, max_in_vertex_labels, weights = True, renumber = False) + G = generate_cugraph(G_df, max_in_vertex_labels, weights = True, renumber = False) else: connections = [] for (src,dst) in zip(sparse_mat.row,sparse_mat.col): From e7dba83a1a0c655f17cc5c7f80f77f1078f70206 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Tue, 23 Jan 2024 16:32:21 +0000 Subject: [PATCH 35/62] Update function name --- PopPUNK/refine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 3239bb0a..70b2e7ca 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -42,7 +42,7 @@ from .network import construct_network_from_df, printClusters from .network import construct_network_from_edge_list from .network import networkSummary -from .network import add_self_loop +from .network import generate_cugraph from .utils import transformLine from .utils import decisionBoundary From 89e5ab8a870fb808d0f272bf21244d4271a126bd Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Tue, 23 Jan 2024 16:52:29 +0000 Subject: [PATCH 36/62] Update network functions --- PopPUNK/network.py | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b6da9a1b..2cd48207 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -678,7 +678,7 @@ def construct_network_from_edge_list(rlist, betweenness_sample = betweenness_sample_default, summarise = True, use_gpu = False): - """Construct an undirected network using a data frame of edges. Nodes are samples and + """Construct an undirected network using a list of edges as tuples. Nodes are samples and edges where samples are within the same cluster Will print summary statistics about the network to ``STDERR`` @@ -688,8 +688,8 @@ def construct_network_from_edge_list(rlist, List of reference sequence labels qlist (list) List of query sequence labels - G_df (cudf or pandas data frame) - Data frame in which the first two columns are the nodes linked by edges + edge_list (list of tuples) + List of tuples describing the edges of the graph weights (list) List of edge weights distMat (2 column ndarray) @@ -735,7 +735,7 @@ def construct_network_from_edge_list(rlist, else: # Cannot generate an array when one edge G_df = cudf.DataFrame(columns = ['source','destination']) - G_df['source'] = [edge_list[0][0]] + G_df['source'] = [edge_list[0][0]] # This does not work G_df['destination'] = [edge_list[0][1]] if weights is not None: G_df['weights'] = weights @@ -1110,28 +1110,6 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label = return G -def get_cugraph_triangles(G): - """Counts the number of triangles in a cugraph - network. Can be removed when the cugraph issue - https://github.com/rapidsai/cugraph/issues/1043 is fixed. - - Args: - G (cugraph network) - Network to be analysed - - Returns: - triangle_count (int) - Count of triangles in graph - """ - nlen = G.number_of_vertices() - df = G.view_edge_list() - df = df.drop(df[df.source == df.destination].index) - A = cp.full((nlen, nlen), 0, dtype = cp.int32) - A[df.source.values, df.destination.values] = 1 - A = cp.maximum( A, A.transpose() ) - triangle_count = int(cp.around(cp.trace(cp.matmul(A, cp.matmul(A, A)))/6,0)) - return triangle_count - def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sample_default, use_gpu = False): """Provides summary values about the network @@ -1157,8 +1135,8 @@ def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sa components = len(component_nums) density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) # need to check consistent with graph-tool - triangle_count = cugraph.triangle_count(G)/3 -# triangle_count = 3*get_cugraph_triangles(G) + triangle_counts = cugraph.triangle_count(G) + triangle_count = triangle_counts['counts'].sum()/3 degree_df = G.in_degree() # consistent with graph-tool triad_count = 0.5 * sum([d * (d - 1) for d in degree_df[degree_df['degree'] > 1]['degree'].to_pandas()]) From 2931541f19096a0374588a5ab9d02add11d14b10 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Tue, 23 Jan 2024 17:45:07 +0000 Subject: [PATCH 37/62] Improve error reporting --- PopPUNK/network.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 2cd48207..3df5719b 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -162,6 +162,8 @@ def checkNetworkVertexCount(seq_list, G, use_gpu): networkMissing = set(set(range(len(seq_list))).difference(vertex_list)) if len(networkMissing) > 0: sys.stderr.write("ERROR: " + str(len(networkMissing)) + " samples are missing from the final network\n") + if len(networkMissing) < 10: + sys.stderr.write('Missing isolate are: ' + ' '.join(networkMissing)) sys.exit(1) def getCliqueRefs(G, reference_indices = set()): @@ -732,11 +734,14 @@ def construct_network_from_edge_list(rlist, edge_array = cp.array(edge_list, dtype = np.int32) edge_gpu_matrix = cuda.to_device(edge_array) G_df = cudf.DataFrame(edge_gpu_matrix, columns = ['source','destination']) - else: + elif len(edge_list) == 1: # Cannot generate an array when one edge G_df = cudf.DataFrame(columns = ['source','destination']) - G_df['source'] = [edge_list[0][0]] # This does not work + G_df['source'] = [edge_list[0][0]] G_df['destination'] = [edge_list[0][1]] + else: + sys.stderr.write('ERROR: Missing link in graph assignment') + sys.exit() if weights is not None: G_df['weights'] = weights G = construct_network_from_df(rlist, qlist, G_df, From 3e923fdbf97c62ccff2fece902d4c8faf0f0e72d Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Tue, 23 Jan 2024 17:52:27 +0000 Subject: [PATCH 38/62] Fix reporting bug --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 3df5719b..c1642183 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -163,7 +163,7 @@ def checkNetworkVertexCount(seq_list, G, use_gpu): if len(networkMissing) > 0: sys.stderr.write("ERROR: " + str(len(networkMissing)) + " samples are missing from the final network\n") if len(networkMissing) < 10: - sys.stderr.write('Missing isolate are: ' + ' '.join(networkMissing)) + sys.stderr.write('Missing isolates are: ' + ' '.join([seq_list[x] for x in networkMissing])) sys.exit(1) def getCliqueRefs(G, reference_indices = set()): From 72de66bde237959f93ed842a7153879acafb0200 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Tue, 23 Jan 2024 19:10:31 +0000 Subject: [PATCH 39/62] More information on missing isolates --- PopPUNK/network.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index c1642183..9b420a4e 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -162,8 +162,11 @@ def checkNetworkVertexCount(seq_list, G, use_gpu): networkMissing = set(set(range(len(seq_list))).difference(vertex_list)) if len(networkMissing) > 0: sys.stderr.write("ERROR: " + str(len(networkMissing)) + " samples are missing from the final network\n") - if len(networkMissing) < 10: - sys.stderr.write('Missing isolates are: ' + ' '.join([seq_list[x] for x in networkMissing])) + if len(networkMissing) == 1: + sys.stderr.write('Missing isolate is: ' + seq_list[networkMissing[0]] + ' (index ' + str(networkMissing[0]) + ')') + elif len(networkMissing) < 10: + sys.stderr.write('Missing isolates are: ' + ' '.join([seq_list[x] for x in networkMissing])) + sys.stderr.write('These have the indices ' + ' '.join([str(x) for x in networkMissing])) sys.exit(1) def getCliqueRefs(G, reference_indices = set()): From 8cb9f5191c53f538a41b4b598f9046cf11177523 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Tue, 23 Jan 2024 19:29:07 +0000 Subject: [PATCH 40/62] Fix set processing --- PopPUNK/network.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 9b420a4e..4e0e5594 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -163,10 +163,11 @@ def checkNetworkVertexCount(seq_list, G, use_gpu): if len(networkMissing) > 0: sys.stderr.write("ERROR: " + str(len(networkMissing)) + " samples are missing from the final network\n") if len(networkMissing) == 1: - sys.stderr.write('Missing isolate is: ' + seq_list[networkMissing[0]] + ' (index ' + str(networkMissing[0]) + ')') + missing_isolate_index = networkMissing.pop() + sys.stderr.write('Missing isolate is: ' + seq_list[missing_isolate_index] + ' (index ' + str(missing_isolate_index) + ')') elif len(networkMissing) < 10: - sys.stderr.write('Missing isolates are: ' + ' '.join([seq_list[x] for x in networkMissing])) - sys.stderr.write('These have the indices ' + ' '.join([str(x) for x in networkMissing])) + sys.stderr.write('Missing isolates are: ' + ' '.join([seq_list[x] for x in networkMissing])) + sys.stderr.write('These have the indices ' + ' '.join([str(x) for x in networkMissing])) sys.exit(1) def getCliqueRefs(G, reference_indices = set()): From 8ef479031f2064d10d1eb6815a01776798fd8122 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Tue, 23 Jan 2024 19:46:26 +0000 Subject: [PATCH 41/62] Avoid omitting nodes --- PopPUNK/network.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 4e0e5594..28660dd2 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1339,15 +1339,15 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, return G, qqDistMat -def generate_cugraph(G_df, seq_num, weights = False, renumber = True): +def generate_cugraph(G_df, max_index, weights = False, renumber = True): """Builds cugraph graph to ensure all nodes are included in the graph, even if singletons. Args: G_df (cudf) cudf data frame containing edge list - seq_num (int) - The expected number of nodes in the graph + max_index (int) + The 0-indexed maximum of the node indices renumber (bool) Whether to renumber the vertices when added to the graph @@ -1356,7 +1356,7 @@ def generate_cugraph(G_df, seq_num, weights = False, renumber = True): Dictionary of cluster assignments (keys are sequence names) """ # use self-loop to ensure all nodes are present - node_indices = cudf.Series(range(seq_num), dtype = cp.int32) + node_indices = cudf.Series(range(seq_num+1), dtype = cp.int32) G_self_loop = cudf.DataFrame() G_self_loop['source'] = node_indices G_self_loop['destination'] = node_indices From 2d458659125fd29aba5413f948393454ac106631 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Tue, 23 Jan 2024 19:52:53 +0000 Subject: [PATCH 42/62] Fix variable name --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 28660dd2..f0839d02 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1356,7 +1356,7 @@ def generate_cugraph(G_df, max_index, weights = False, renumber = True): Dictionary of cluster assignments (keys are sequence names) """ # use self-loop to ensure all nodes are present - node_indices = cudf.Series(range(seq_num+1), dtype = cp.int32) + node_indices = cudf.Series(range(max_index+1), dtype = cp.int32) G_self_loop = cudf.DataFrame() G_self_loop['source'] = node_indices G_self_loop['destination'] = node_indices From e5f5b9a5e42bb5ddd56e44fa66d9a2ccb0ea3005 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Tue, 23 Jan 2024 21:18:21 +0000 Subject: [PATCH 43/62] Add BGMM max batch size --- PopPUNK/__main__.py | 2 +- PopPUNK/models.py | 22 +++++++++++++--------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index f53d6635..7dff02f0 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -501,7 +501,7 @@ def main(): assignments = model.fit(distMat, args.D, args.min_cluster_prop, args.gpu_model) # Run Gaussian model elif args.fit_model == "bgmm": - model = BGMMFit(output, max_samples = args.model_subsample) + model = BGMMFit(output, max_samples = args.model_subsample, max_batch_size = args.assign_subsample) model.set_threads(args.threads) assignments = model.fit(distMat, args.K) elif args.fit_model == "refine": diff --git a/PopPUNK/models.py b/PopPUNK/models.py index a177d909..f84d2b3d 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -293,12 +293,12 @@ class BGMMFit(ClusterFit): (default = 100000) ''' - def __init__(self, outPrefix, max_samples = 100000): + def __init__(self, outPrefix, max_samples = 100000, max_batch_size = 100000): ClusterFit.__init__(self, outPrefix) self.type = 'bgmm' self.preprocess = True self.max_samples = max_samples - + self.max_batch_size = max_batch_size def fit(self, X, max_components): '''Extends :func:`~ClusterFit.fit` @@ -326,7 +326,7 @@ def fit(self, X, max_components): self.covariances = self.dpgmm.covariances_ self.fitted = True - y = self.assign(X) + y = self.assign(X, max_batch_size = self.max_batch_size) self.within_label = findWithinLabel(self.means, y) self.between_label = findWithinLabel(self.means, y, 1) return y @@ -384,7 +384,7 @@ def plot(self, X, y): if not hasattr(self, 'subsampled_X'): self.subsampled_X = utils.shuffle(X, random_state=random.randint(1,10000))[0:self.max_samples,] - y_subsample = self.assign(self.subsampled_X, values=True, progress=False) + y_subsample = self.assign(self.subsampled_X, max_batch_size = self.max_batch_size, values=True, progress=False) avg_entropy = np.mean(np.apply_along_axis(stats.entropy, 1, y_subsample)) used_components = np.unique(y).size @@ -402,7 +402,7 @@ def plot(self, X, y): plot_contours(self, y, title + " assignment boundary", outfile + "_contours") - def assign(self, X, values = False, progress=True): + def assign(self, X, max_batch_size = 100000, values = False, progress=True): '''Assign the clustering of new samples using :func:`~PopPUNK.bgmm.assign_samples` Args: @@ -410,8 +410,8 @@ def assign(self, X, values = False, progress=True): Core and accessory distances values (bool) Return the responsibilities of assignment rather than most likely cluster - num_processes (int) - Number of threads to use + max_batch_size (int) + Size of batches to be assigned progress (bool) Show progress bar @@ -430,7 +430,7 @@ def assign(self, X, values = False, progress=True): y = np.zeros((X.shape[0], len(self.weights)), dtype=X.dtype) else: y = np.zeros(X.shape[0], dtype=int) - block_size = 100000 + block_size = max_batch_size with SharedMemoryManager() as smm: shm_X = smm.SharedMemory(size = X.nbytes) X_shared_array = np.ndarray(X.shape, dtype = X.dtype, buffer = shm_X.buf) @@ -574,7 +574,11 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): self.cluster_mins[i,] = [np.min(self.subsampled_X[self.labels==i,0]),np.min(self.subsampled_X[self.labels==i,1])] self.cluster_maxs[i,] = [np.max(self.subsampled_X[self.labels==i,0]),np.max(self.subsampled_X[self.labels==i,1])] - y = self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu = False) + y = self.assign(self.subsampled_X, + no_scale=True, + progress=False, + max_batch_size = self.subsampled_X.shape[0], + use_gpu = False) # Evaluate clustering self.within_label = findWithinLabel(self.cluster_means, y) From 758e0b122fb8988e8f5a7e2de5d4aa20b806b0c1 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Tue, 23 Jan 2024 21:35:44 +0000 Subject: [PATCH 44/62] Update assignment in fitting --- PopPUNK/models.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index f84d2b3d..2be23e3b 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -673,7 +673,11 @@ def plot(self, X=None, y=None): self.scale = cp.asarray(self.scale) plot_dbscan_results(self.subsampled_X * self.scale, - self.assign(self.subsampled_X, no_scale=True, progress=False, use_gpu=self.use_gpu), + self.assign(self.subsampled_X, + max_batch_size = self.max_batch_size, + no_scale=True, + progress=False, + use_gpu=self.use_gpu), self.n_clusters, self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_dbscan", self.use_gpu) From 3c92279d9efb9d4388a05adf2acbeea8bad2a422 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 24 Jan 2024 06:12:24 +0000 Subject: [PATCH 45/62] GPU specification in save and load --- PopPUNK/models.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 2be23e3b..b4d308f8 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -614,7 +614,8 @@ def save(self): means=self.cluster_means, maxs=self.cluster_maxs, mins=self.cluster_mins, - scale=self.scale) + scale=self.scale, + use_gpu=self.use_gpu) with open(self.outPrefix + "/" + os.path.basename(self.outPrefix) + '_fit.pkl', 'wb') as pickle_file: pickle.dump([self.hdb, self.type], pickle_file) @@ -637,6 +638,8 @@ def load(self, fit_npz, fit_obj): self.cluster_means = fit_npz['means'] self.cluster_maxs = fit_npz['maxs'] self.cluster_mins = fit_npz['mins'] + self.scale = fit_npz['scale'] + self.use_gpu = fit_npz['use_gpu'] self.fitted = True @@ -687,7 +690,7 @@ def assign(self, X, no_scale = False, progress = True, max_batch_size = 5000, us '''Assign the clustering of new samples using :func:`~PopPUNK.dbscan.assign_samples_dbscan` Args: - X (numpy.array) + X (numpy.array or cupy.array) Core and accessory distances no_scale (bool) Do not scale X From e857fca3f729647d2f955e18e328f1ae1b5491e9 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 24 Jan 2024 06:38:07 +0000 Subject: [PATCH 46/62] Explicit numpy conversion in predict --- PopPUNK/models.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index b4d308f8..d2d8f2aa 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -728,8 +728,11 @@ def assign(self, X, no_scale = False, progress = True, max_batch_size = 5000, us start_index = block*block_size end_index = min((block+1)*block_size-1,X.shape[0]) sys.stderr.write("Processing rows " + str(start_index) + " to " + str(end_index) + "\n") - y_assignments, y_probabilities = cluster.hdbscan.approximate_predict(self.hdb, X[start_index:end_index,]) - y[start_index:end_index] = cp.asnumpy(y_assignments) + # cuml v24.02 always returns numpy therefore make conversion explicit + y[start_index:end_index], y_probabilities = cluster.hdbscan.approximate_predict(self.hdb, + X[start_index:end_index,], + convert_dtype = True) + del y_probabilities else: y = np.zeros(X.shape[0], dtype=int) n_blocks = (X.shape[0] - 1) // block_size + 1 From 101a910f6869962a8f60126e26f33d3785b016e6 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 16 Feb 2024 08:00:49 +0000 Subject: [PATCH 47/62] Enable fitting without assigning all points --- PopPUNK/__main__.py | 55 ++++++++++++++++++++++++++++++++++----------- PopPUNK/models.py | 33 ++++++++++++++++++++++----- 2 files changed, 69 insertions(+), 19 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 7dff02f0..dbbea61a 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -114,15 +114,34 @@ def get_options(): # model fitting modelGroup = parser.add_argument_group('Model fit options') - modelGroup.add_argument('--model-subsample', help='Number of pairwise distances used to fit model [default = 100000]', - type=int, default=100000) - modelGroup.add_argument('--assign-subsample', help='Number of pairwise distances in each assignment batch [default = 5000]', - type=int, default=5000) - modelGroup.add_argument('--K', help='Maximum number of mixture components [default = 2]', type=int, default=2) - modelGroup.add_argument('--D', help='Maximum number of clusters in DBSCAN fitting [default = 100]', type=int, default=100) - modelGroup.add_argument('--min-cluster-prop', help='Minimum proportion of points in a cluster ' - 'in DBSCAN fitting [default = 0.0001]', type=float, default=0.0001) - modelGroup.add_argument('--threshold', help='Cutoff if using --fit-model threshold', type=float) + modelGroup.add_argument('--model-subsample', + help='Number of pairwise distances used to fit model [default = 100000]', + type=int, + default=100000) + modelGroup.add_argument('--assign-subsample', + help='Number of pairwise distances in each assignment batch [default = 5000]', + type=int, + default=5000) + modelGroup.add_argument('--no-assign', + help='Fit the model without assigning all points', + default=False, + action='store_true')) + modelGroup.add_argument('--K', + help='Maximum number of mixture components [default = 2]', + type=int, + default=2) + modelGroup.add_argument('--D', + help='Maximum number of clusters in DBSCAN fitting [default = 100]', + type=int, + default=100) + modelGroup.add_argument('--min-cluster-prop', + help='Minimum proportion of points in a cluster ' + 'in DBSCAN fitting [default = 0.0001]', + type=float, + default=0.0001) + modelGroup.add_argument('--threshold', + help='Cutoff if using --fit-model threshold', + type=float) # model refinement refinementGroup = parser.add_argument_group('Refine model options') @@ -496,14 +515,24 @@ def main(): if args.fit_model: # Run DBSCAN model if args.fit_model == "dbscan": - model = DBSCANFit(output, max_samples = args.model_subsample, max_batch_size = args.assign_subsample) + model = DBSCANFit(output, + max_samples = args.model_subsample, + max_batch_size = args.assign_subsample, + assign_points = args.no_assign) model.set_threads(args.threads) - assignments = model.fit(distMat, args.D, args.min_cluster_prop, args.gpu_model) + assignments = model.fit(distMat, + args.D, + args.min_cluster_prop, + args.gpu_model) # Run Gaussian model elif args.fit_model == "bgmm": - model = BGMMFit(output, max_samples = args.model_subsample, max_batch_size = args.assign_subsample) + model = BGMMFit(output, + max_samples = args.model_subsample, + max_batch_size = args.assign_subsample, + assign_points = args.no_assign) model.set_threads(args.threads) - assignments = model.fit(distMat, args.K) + assignments = model.fit(distMat, + args.K) elif args.fit_model == "refine": new_model = RefineFit(output) new_model.set_threads(args.threads) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index d2d8f2aa..f084373f 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -293,12 +293,13 @@ class BGMMFit(ClusterFit): (default = 100000) ''' - def __init__(self, outPrefix, max_samples = 100000, max_batch_size = 100000): + def __init__(self, outPrefix, max_samples = 100000, max_batch_size = 100000, assign_points = True): ClusterFit.__init__(self, outPrefix) self.type = 'bgmm' self.preprocess = True self.max_samples = max_samples self.max_batch_size = max_batch_size + self.assign_points = assign_points def fit(self, X, max_components): '''Extends :func:`~ClusterFit.fit` @@ -325,8 +326,12 @@ def fit(self, X, max_components): self.means = self.dpgmm.means_ self.covariances = self.dpgmm.covariances_ self.fitted = True - - y = self.assign(X, max_batch_size = self.max_batch_size) + + # Allow for partial fitting that only assigns the subsample not the full set + if self.assign_points: + y = self.assign(X, max_batch_size = self.max_batch_size) + else: + y = self.assign(self.subsampled_X, max_batch_size = self.max_batch_size) self.within_label = findWithinLabel(self.means, y) self.between_label = findWithinLabel(self.means, y, 1) return y @@ -472,12 +477,13 @@ class DBSCANFit(ClusterFit): (default = 100000) ''' - def __init__(self, outPrefix, use_gpu = False, max_batch_size = 5000, max_samples = 100000): + def __init__(self, outPrefix, use_gpu = False, max_batch_size = 5000, max_samples = 100000, assign_points = True): ClusterFit.__init__(self, outPrefix) self.type = 'dbscan' self.preprocess = True self.max_batch_size = max_batch_size self.max_samples = max_samples + self.assign_points = assign_points self.use_gpu = use_gpu # Updated below def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): @@ -598,7 +604,12 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): elif not use_gpu: shutil.rmtree(cache_out) - y = self.assign(X, max_batch_size = self.max_batch_size, use_gpu = use_gpu) + # Allow for partial fitting that only assigns the subsample not the full set + if self.assign_points: + y = self.assign(X, max_batch_size = self.max_batch_size, use_gpu = use_gpu) + else: + y = self.assign(self.subsampled_X, max_batch_size = self.max_batch_size, use_gpu = use_gpu) + return y @@ -615,6 +626,7 @@ def save(self): maxs=self.cluster_maxs, mins=self.cluster_mins, scale=self.scale, + assign_points = self.assign_points, use_gpu=self.use_gpu) with open(self.outPrefix + "/" + os.path.basename(self.outPrefix) + '_fit.pkl', 'wb') as pickle_file: pickle.dump([self.hdb, self.type], pickle_file) @@ -639,7 +651,16 @@ def load(self, fit_npz, fit_obj): self.cluster_maxs = fit_npz['maxs'] self.cluster_mins = fit_npz['mins'] self.scale = fit_npz['scale'] - self.use_gpu = fit_npz['use_gpu'] + if 'use_gpu' in fit_npz.keys(): + self.use_gpu = fit_npz['use_gpu'] + else: + # Default for backwards compatibility + self.use_gpu = False + if 'assign_points' in fit_npz.keys(): + self.assign_points = fit_npz['assign_points'] + else: + # Default for backwards compatibility + self.assign_points = True self.fitted = True From 6baf0a0cd67e85e5c2052b6eaf5a95528151e6e0 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 16 Feb 2024 08:02:00 +0000 Subject: [PATCH 48/62] Fix bracket --- PopPUNK/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index dbbea61a..bcb68bad 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -125,7 +125,7 @@ def get_options(): modelGroup.add_argument('--no-assign', help='Fit the model without assigning all points', default=False, - action='store_true')) + action='store_true') modelGroup.add_argument('--K', help='Maximum number of mixture components [default = 2]', type=int, From b5d04efe19c491f7b7617151a7632165b6635f58 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 16 Feb 2024 08:37:12 +0000 Subject: [PATCH 49/62] Clarify no-assign use --- PopPUNK/__main__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index bcb68bad..e78fbc0a 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -123,7 +123,7 @@ def get_options(): type=int, default=5000) modelGroup.add_argument('--no-assign', - help='Fit the model without assigning all points', + help='Fit the model without assigning all points (only applies to BGMM and DBSCAN models)', default=False, action='store_true') modelGroup.add_argument('--K', @@ -518,7 +518,7 @@ def main(): model = DBSCANFit(output, max_samples = args.model_subsample, max_batch_size = args.assign_subsample, - assign_points = args.no_assign) + assign_points = not args.no_assign) model.set_threads(args.threads) assignments = model.fit(distMat, args.D, @@ -529,7 +529,7 @@ def main(): model = BGMMFit(output, max_samples = args.model_subsample, max_batch_size = args.assign_subsample, - assign_points = args.no_assign) + assign_points = not args.no_assign) model.set_threads(args.threads) assignments = model.fit(distMat, args.K) From f85bc75c5740ce21eab58f3f86c6ab91eb31a5dd Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 16 Feb 2024 08:40:40 +0000 Subject: [PATCH 50/62] Simplify DBSCAN fitting --- PopPUNK/models.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index f084373f..8711ed48 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -561,12 +561,6 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): self.cluster_means[cp.array(i),] = [cp.mean(self.subsampled_X[labelled_rows,cp.array([0])]),cp.mean(self.subsampled_X[labelled_rows,cp.array([1])])] self.cluster_mins[cp.array(i),] = [cp.min(self.subsampled_X[labelled_rows,cp.array([0])]),cp.min(self.subsampled_X[labelled_rows,cp.array([1])])] self.cluster_maxs[cp.array(i),] = [cp.max(self.subsampled_X[labelled_rows,cp.array([0])]),cp.max(self.subsampled_X[labelled_rows,cp.array([1])])] - - y = self.assign(self.subsampled_X, - no_scale=True, - progress=False, - max_batch_size = self.subsampled_X.shape[0], - use_gpu = True) else: # get within strain cluster @@ -580,11 +574,12 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): self.cluster_mins[i,] = [np.min(self.subsampled_X[self.labels==i,0]),np.min(self.subsampled_X[self.labels==i,1])] self.cluster_maxs[i,] = [np.max(self.subsampled_X[self.labels==i,0]),np.max(self.subsampled_X[self.labels==i,1])] - y = self.assign(self.subsampled_X, - no_scale=True, - progress=False, - max_batch_size = self.subsampled_X.shape[0], - use_gpu = False) + # Run assignment + y = self.assign(self.subsampled_X, + no_scale=True, + progress=False, + max_batch_size = self.subsampled_X.shape[0], + use_gpu = use_gpu) # Evaluate clustering self.within_label = findWithinLabel(self.cluster_means, y) From 94a88d970cdb59cb59bf8e971addb8a636d07863 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 16 Feb 2024 08:50:25 +0000 Subject: [PATCH 51/62] Exit before network construction if not assigning --- PopPUNK/__main__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index e78fbc0a..1fc016c3 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -591,6 +591,10 @@ def main(): else: assignments = model.assign(distMat) + # end here if not assigning data + if args.no_assign: + sys.exit(0) + #******************************# #* *# #* network construction *# From 00b865109eb1d8d84924db731cd1ebbb2290c6ab Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 16 Feb 2024 09:06:48 +0000 Subject: [PATCH 52/62] Do not allow assignment with an incompletely fitting model --- PopPUNK/assign.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index be1b848d..f601c9b5 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -408,6 +408,11 @@ def assign_query_hdf5(dbFuncs, raise RuntimeError("lineage models cannot be used with --serial") model.set_threads(threads) + # Only proceed with a fully-fitted model + if not self.fitted or (hasattr(self,'assign_points') and self.assign_points == False): + sys.stderr.write('Cannot assign points with an incompletely-fitted model\n') + sys.exit(1) + # Set directories of previous fit if previous_clustering is not None: prev_clustering = previous_clustering From 65035ba20baf4881140e9a003bbd43781bb7eccf Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 16 Feb 2024 09:20:59 +0000 Subject: [PATCH 53/62] Fix model assignment checks --- PopPUNK/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 8711ed48..be9562d5 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -800,6 +800,7 @@ def __init__(self, outPrefix): self.slope = 2 self.threshold = False self.unconstrained = False + self.assign_points = True def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indiv_refine = False, unconstrained = False, multi_boundary = 0, score_idx = 0, no_local = False, From bc2ba191255ca0f0b2201da852e800457001c39a Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 16 Feb 2024 09:28:34 +0000 Subject: [PATCH 54/62] Correct attribute checking --- PopPUNK/assign.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index f601c9b5..6f898b17 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -409,7 +409,7 @@ def assign_query_hdf5(dbFuncs, model.set_threads(threads) # Only proceed with a fully-fitted model - if not self.fitted or (hasattr(self,'assign_points') and self.assign_points == False): + if not model.fitted or (hasattr(model,'assign_points') and model.assign_points == False): sys.stderr.write('Cannot assign points with an incompletely-fitted model\n') sys.exit(1) From 0382227c5041279c843c63a3e774eed3735f9e4d Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 23 Feb 2024 09:44:41 +0000 Subject: [PATCH 55/62] Improve CLI --- PopPUNK/__main__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 1fc016c3..05d37355 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -122,8 +122,8 @@ def get_options(): help='Number of pairwise distances in each assignment batch [default = 5000]', type=int, default=5000) - modelGroup.add_argument('--no-assign', - help='Fit the model without assigning all points (only applies to BGMM and DBSCAN models)', + modelGroup.add_argument('--for-refine', + help='Fit a BGMM or DBSCAN model without assigning all points to initialise a refined model', default=False, action='store_true') modelGroup.add_argument('--K', @@ -518,7 +518,7 @@ def main(): model = DBSCANFit(output, max_samples = args.model_subsample, max_batch_size = args.assign_subsample, - assign_points = not args.no_assign) + assign_points = not args.for_refine) model.set_threads(args.threads) assignments = model.fit(distMat, args.D, @@ -529,7 +529,7 @@ def main(): model = BGMMFit(output, max_samples = args.model_subsample, max_batch_size = args.assign_subsample, - assign_points = not args.no_assign) + assign_points = not args.for_refine) model.set_threads(args.threads) assignments = model.fit(distMat, args.K) @@ -592,7 +592,7 @@ def main(): assignments = model.assign(distMat) # end here if not assigning data - if args.no_assign: + if args.for_refine: sys.exit(0) #******************************# From 843f7be5febeeee7be308f21f97fb947b7466d52 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 23 Feb 2024 09:56:54 +0000 Subject: [PATCH 56/62] Add reassuring exit message --- PopPUNK/__main__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 05d37355..30daff34 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -593,6 +593,7 @@ def main(): # end here if not assigning data if args.for_refine: + sys.stderr.write('Initial model fit complete; points will be assigned when this model is refined\nusing "--fit-model refine"\n') sys.exit(0) #******************************# From 3f1ea312fd58874b133b3c77750c0ccfdaf4886e Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 23 Feb 2024 09:58:26 +0000 Subject: [PATCH 57/62] Improve error message --- PopPUNK/assign.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 6f898b17..da8adc87 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -410,7 +410,7 @@ def assign_query_hdf5(dbFuncs, # Only proceed with a fully-fitted model if not model.fitted or (hasattr(model,'assign_points') and model.assign_points == False): - sys.stderr.write('Cannot assign points with an incompletely-fitted model\n') + sys.stderr.write('Cannot assign points with an incompletely-fitted model\nPlease refine this initial fit with "--fit-model refine"\n') sys.exit(1) # Set directories of previous fit From 5454f7a92d7428b65d34240cae505e19eb800fb6 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 23 Feb 2024 10:53:24 +0000 Subject: [PATCH 58/62] Fix awful min_sample condition --- PopPUNK/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index be9562d5..6067c545 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -513,7 +513,7 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): # DBSCAN parameters cache_out = "./" + self.outPrefix + "_cache" - min_samples = min(max(int(min_cluster_prop * self.subsampled_X.shape[0]), 10),1023) + min_samples = min(int(min_cluster_prop * self.subsampled_X.shape[0]), 1023) min_cluster_size = max(int(0.01 * self.subsampled_X.shape[0]), 10) # Check on initialisation of GPU libraries and memory From f721ca1edffb3012e71d480a5fbca4c2a2378db3 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 23 Feb 2024 11:12:21 +0000 Subject: [PATCH 59/62] Clarify awful min_sample condition --- PopPUNK/models.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 6067c545..832a27ee 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -513,7 +513,8 @@ def fit(self, X, max_num_clusters, min_cluster_prop, use_gpu = False): # DBSCAN parameters cache_out = "./" + self.outPrefix + "_cache" - min_samples = min(int(min_cluster_prop * self.subsampled_X.shape[0]), 1023) + min_samples = max(int(min_cluster_prop * self.subsampled_X.shape[0]), 10) # do not allow clusters of < 10 points + min_samples = min(min_samples,1023) # do not allow clusters to require more than 1023 points at the start min_cluster_size = max(int(0.01 * self.subsampled_X.shape[0]), 10) # Check on initialisation of GPU libraries and memory From 9a229c67bf31b78ae7ecc249de9ba5892ed355e9 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Tue, 27 Feb 2024 10:54:58 +0000 Subject: [PATCH 60/62] Add tests --- test/run_test.py | 1 + test/test-gpu.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/test/run_test.py b/test/run_test.py index 5f26a2f8..fa14c96c 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -31,6 +31,7 @@ #fit dbscan sys.stderr.write("Running DBSCAN model fit (--fit-model dbscan)\n") +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model dbscan --ref-db example_db --output example_dbscan --overwrite --graph-weights --for-refine", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model dbscan --ref-db example_db --output example_dbscan --overwrite --graph-weights", shell=True, check=True) #refine model with GMM diff --git a/test/test-gpu.py b/test/test-gpu.py index 58af75c2..d7e003ca 100755 --- a/test/test-gpu.py +++ b/test/test-gpu.py @@ -31,6 +31,10 @@ #fit dbscan sys.stderr.write("Running DBSCAN model fit (--fit-model dbscan)\n") +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model dbscan --ref-db example_db --output example_dbscan --overwrite --gpu-graph --gpu-model", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model dbscan --ref-db example_db --output example_dbscan --overwrite --gpu-graph --gpu-model --for-refine", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model dbscan --ref-db example_db --output example_dbscan --overwrite --gpu-graph --gpu-model --assign-subsample 1000", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model dbscan --ref-db example_db --output example_dbscan --overwrite --gpu-graph --gpu-model --model-subsample 5000", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model dbscan --ref-db example_db --output example_dbscan --overwrite --graph-weights --gpu-graph", shell=True, check=True) #refine model with GMM From 6795025be0c25c72ff99a5731954f5c28169f7b5 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Tue, 27 Feb 2024 11:01:14 +0000 Subject: [PATCH 61/62] Update docs with GPU fitting info --- docs/gpu.rst | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/docs/gpu.rst b/docs/gpu.rst index a1559d2f..2b31215c 100644 --- a/docs/gpu.rst +++ b/docs/gpu.rst @@ -111,4 +111,15 @@ Using a GPU By default, PopPUNK will use not use GPUs. To use them, you will need to add the flag ``--gpu-sketch`` (when constructing or querying a database using reads), ``--gpu-dist`` (when constructing or querying a database from assemblies or reads), -or ``--gpu-graph`` (when querying or visualising a database, or fitting a model). +``--gpu-model`` (when fitting a DBSCAN model on the GPU), or ``--gpu-graph`` +(when querying or visualising a database, or fitting a model). + +Note that fitting a model with a GPU is fast, even with a large subsample of points, +but may be limited by the memory of the GPU device. Therefore it is recommended that +either the model is only fitted to a subsample of points, resulting in an incomplete +model fit that must then be refined before use (option ``--for-refine``), or that +the transfer of data between CPU and GPU is optimised using the ``--assign-subsample`` +variable. Larger values of ``--assign-subsample`` will result in fewer batches being +transferred to the GPU memory, speeding up the process, but also increasing the risks +that your device will run out of memory, particularly if a large, complex model object +is already being stored on the device. From 2920b004ed83c2fa808bf80e2408426b5a769698 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Tue, 27 Feb 2024 16:17:25 +0000 Subject: [PATCH 62/62] Bump version --- PopPUNK/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/__init__.py b/PopPUNK/__init__.py index 34bca61d..7fe643a0 100644 --- a/PopPUNK/__init__.py +++ b/PopPUNK/__init__.py @@ -3,7 +3,7 @@ '''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)''' -__version__ = '2.6.3' +__version__ = '2.6.4' # Minimum sketchlib version SKETCHLIB_MAJOR = 2