From 357ae4c02049293e3a2d9781321fda0c5bce6526 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Mon, 30 Jan 2023 16:32:43 +0000 Subject: [PATCH 01/37] Move pairwise distance plot to db dir --- PopPUNK/plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index f48eeaa3..443413bf 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -77,7 +77,7 @@ def plot_scatter(X, out_prefix, title, kde = True): plt.title(title) plt.xlabel('Core distance (' + r'$\pi$' + ')') plt.ylabel('Accessory distance (' + r'$a$' + ')') - plt.savefig(out_prefix + '.png') + plt.savefig(os.path.join(out_prefix, out_prefix + '.png')) plt.close() def plot_fit(klist, raw_matching, raw_fit, corrected_matching, corrected_fit, out_prefix, title): From d3843356d9107192919bd0d00053cf64da3a5da0 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Mon, 30 Jan 2023 16:58:16 +0000 Subject: [PATCH 02/37] Add db evaluation histograms --- PopPUNK/__main__.py | 4 ++- PopPUNK/plot.py | 59 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 2fe144dc..6960dc11 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -240,6 +240,7 @@ def main(): from .plot import writeClusterCsv from .plot import plot_scatter + from .plot import plot_database_evaluations from .qc import prune_distance_matrix, qcDistMat, sketchlibAssemblyQC, remove_qc_fail @@ -363,8 +364,9 @@ def main(): # Plot results if not args.no_plot: plot_scatter(distMat, - f"{args.output}/{os.path.basename(args.output)}_distanceDistribution", + args.output, args.output + " distances") + plot_database_evaluations(args.output) #******************************# #* *# diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index 443413bf..8a832e0f 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -15,6 +15,7 @@ import itertools # for other outputs import pandas as pd +import h5py from collections import defaultdict from sklearn import utils try: # sklearn >= 0.22 @@ -77,7 +78,63 @@ def plot_scatter(X, out_prefix, title, kde = True): plt.title(title) plt.xlabel('Core distance (' + r'$\pi$' + ')') plt.ylabel('Accessory distance (' + r'$a$' + ')') - plt.savefig(os.path.join(out_prefix, out_prefix + '.png')) + plt.savefig(os.path.join(out_prefix, os.path.basename(out_prefix) + '_distanceDistribution.png')) + plt.close() + +def plot_database_evaluations(prefix): + """Plot histograms of sequence characteristics for database evaluation. + + Args: + prefix (str) + Prefix of database + """ + db_file = prefix + "/" + os.path.basename(prefix) + ".h5" + ref_db = h5py.File(db_file, 'r') + + genome_lengths = [] + ambiguous_bases = [] + for sample_name in list(ref_db['sketches'].keys()): + genome_lengths.append(ref_db['sketches/' + sample_name].attrs['length']) + ambiguous_bases.append(ref_db['sketches/' + sample_name].attrs['missing_bases']) + plot_evaluation_histogram(genome_lengths, + n_bins = 100, + prefix = prefix, + suffix = 'genome_lengths', + plt_title = 'Distribution of sequence lengths', + xlab = 'Sequence length (nt)') + plot_evaluation_histogram(ambiguous_bases, + n_bins = 100, + prefix = prefix, + suffix = 'ambiguous_base_counts', + plt_title = 'Distribution of ambiguous base counts', + xlab = 'Number of ambiguous bases') + +def plot_evaluation_histogram(input_data, n_bins = 100, prefix = 'hist', + suffix = '', plt_title = 'histogram', xlab = 'x'): + """Plot histograms of sequence characteristics for database evaluation. + + Args: + input_data (list) + Input data (list of numbers) + n_bins (int) + Number of bins to use for the histogram + prefix (str) + Prefix of database + suffix (str) + Suffix specifying plot type + plt_title (str) + Title for plot + xlab (str) + Title for the horizontal axis + """ + plt.figure(figsize=(8, 8), dpi=160, facecolor='w', edgecolor='k') + counts, bins = np.histogram(input_data, bins = n_bins) + plt.stairs(counts, bins, fill = True) + plt.title(plt_title) + plt.xlabel(xlab) + plt.ylabel('Frequency') + plt.savefig(os.path.join(out_prefix, os.path.basename(out_prefix) + suffix + '.png')) + plt.savefig(os.path.join(prefix,prefix + '.png')) plt.close() def plot_fit(klist, raw_matching, raw_fit, corrected_matching, corrected_fit, out_prefix, title): From 17cb94798be5a86e1b29ed12d99986a3a65fd678 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Mon, 30 Jan 2023 17:01:46 +0000 Subject: [PATCH 03/37] Correct variable name --- PopPUNK/plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index 8a832e0f..b651e0e8 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -133,7 +133,7 @@ def plot_evaluation_histogram(input_data, n_bins = 100, prefix = 'hist', plt.title(plt_title) plt.xlabel(xlab) plt.ylabel('Frequency') - plt.savefig(os.path.join(out_prefix, os.path.basename(out_prefix) + suffix + '.png')) + plt.savefig(os.path.join(prefix, os.path.basename(prefix) + suffix + '.png')) plt.savefig(os.path.join(prefix,prefix + '.png')) plt.close() From 325171cdaf2f1ea6f5bd3a14dcbf78f9149ab79f Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 1 Feb 2023 06:08:17 +0000 Subject: [PATCH 04/37] Inform user of QC process and output --- PopPUNK/__main__.py | 11 ++++++++--- PopPUNK/qc.py | 16 ++++++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 6960dc11..aa59b15f 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -405,7 +405,6 @@ def main(): fail_unconditionally[line.rstrip] = ["removed"] # assembly qc - sys.stderr.write("Running sequence QC\n") pass_assembly_qc, fail_assembly_qc = \ sketchlibAssemblyQC(args.ref_db, refList, @@ -413,7 +412,6 @@ def main(): sys.stderr.write(f"{len(fail_assembly_qc)} samples failed\n") # QC pairwise distances to identify long distances indicative of anomalous sequences in the collection - sys.stderr.write("Running distance QC\n") pass_dist_qc, fail_dist_qc = \ qcDistMat(distMat, refList, @@ -430,13 +428,20 @@ def main(): raise RuntimeError('Type isolate ' + qc_dict['type_isolate'] + \ ' not found in isolates after QC; check ' 'name of type isolate and QC options\n') - + + sys.stderr.write(f"{len(passed)} samples passed QC\n") if len(passed) < len(refList): remove_qc_fail(qc_dict, refList, passed, [fail_unconditionally, fail_assembly_qc, fail_dist_qc], args.ref_db, distMat, output, args.strand_preserved, args.threads) + # Plot results + if not args.no_plot: + plot_scatter(distMat, + args.output, + args.output + " distances") + plot_database_evaluations(args.output) #******************************# #* *# diff --git a/PopPUNK/qc.py b/PopPUNK/qc.py index 9ae8359f..5b5f8b49 100755 --- a/PopPUNK/qc.py +++ b/PopPUNK/qc.py @@ -153,7 +153,17 @@ def sketchlibAssemblyQC(prefix, names, qc_dict): import h5py from .sketchlib import removeFromDB + # Make user aware of all filters being used (including defaults) sys.stderr.write("Running QC on sketches\n") + if (qc_dict['upper_n'] is not None: + sys.stderr.write("Using count cutoff for ambiguous bases: " + str(qc_dict['upper_n']) + "\n") + else: + sys.stderr.write("Using proportion cutoff for ambiguous bases: " + str(qc_dict['prop_n']) + "\n") + if qc_dict['length_range'][0] is None: + sys.stderr.write("Using standard deviation for length cutoff: " + str(qc_dict['length_sigma']) + "\n") + else: + sys.stderr.write("Using range for length cutoffs: " + str(qc_dict['length_range'][0]) + " - " + \ + str(qc_dict['length_range'][1]) + "\n") # open databases db_name = prefix + '/' + os.path.basename(prefix) + '.h5' @@ -245,6 +255,12 @@ def qcDistMat(distMat, refList, queryList, ref_db, qc_dict): failed (dict) List of sequences failing, and reasons """ + # Make user aware of all filters being used (including defaults) + sys.stderr.write("Running QC on distances\n") + sys.stderr.write("Using cutoff for core distances: " + str(qc_dict['max_pi_dist']) + "\n") + sys.stderr.write("Using cutoff for accessory distances: " + str(qc_dict['max_a_dist']) + "\n") + sys.stderr.write("Using cutoff for proportion of zero distances: " + str(qc_dict['prop_zero']) + "\n") + # Create overall list of sequences if refList == queryList: names = refList From 316283a50afcb98df124c668faf7c5e659c6eae1 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 1 Feb 2023 06:16:54 +0000 Subject: [PATCH 05/37] Fix bracket --- PopPUNK/qc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/qc.py b/PopPUNK/qc.py index 5b5f8b49..14edbdf7 100755 --- a/PopPUNK/qc.py +++ b/PopPUNK/qc.py @@ -155,7 +155,7 @@ def sketchlibAssemblyQC(prefix, names, qc_dict): # Make user aware of all filters being used (including defaults) sys.stderr.write("Running QC on sketches\n") - if (qc_dict['upper_n'] is not None: + if qc_dict['upper_n'] is not None: sys.stderr.write("Using count cutoff for ambiguous bases: " + str(qc_dict['upper_n']) + "\n") else: sys.stderr.write("Using proportion cutoff for ambiguous bases: " + str(qc_dict['prop_n']) + "\n") From 7041cf75355b5e82006f3447fd693af7158f6834 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 1 Feb 2023 06:34:22 +0000 Subject: [PATCH 06/37] Fix plot names --- PopPUNK/plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index b651e0e8..0b34243e 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -133,7 +133,7 @@ def plot_evaluation_histogram(input_data, n_bins = 100, prefix = 'hist', plt.title(plt_title) plt.xlabel(xlab) plt.ylabel('Frequency') - plt.savefig(os.path.join(prefix, os.path.basename(prefix) + suffix + '.png')) + plt.savefig(os.path.join(prefix, os.path.basename(prefix) + '_' + suffix + '.png')) plt.savefig(os.path.join(prefix,prefix + '.png')) plt.close() From 185c939765694984f9740dea3c906cb095d1744e Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 1 Feb 2023 06:36:10 +0000 Subject: [PATCH 07/37] Fix plot prefixes --- PopPUNK/__main__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index aa59b15f..61dd4b47 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -439,9 +439,9 @@ def main(): # Plot results if not args.no_plot: plot_scatter(distMat, - args.output, - args.output + " distances") - plot_database_evaluations(args.output) + output, + output + " distances") + plot_database_evaluations(output) #******************************# #* *# From bda439852dcc7694d518fb07afec1e0b7e26519a Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 1 Feb 2023 11:10:36 +0000 Subject: [PATCH 08/37] Validated update of https://github.com/rapidsai/cugraph/pull/2671 --- PopPUNK/network.py | 27 +-------------------------- 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 3c9c6a4e..cf40e0f9 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1105,27 +1105,6 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label = return G -def get_cugraph_triangles(G): - """Counts the number of triangles in a cugraph - network. Can be removed when the cugraph issue - https://github.com/rapidsai/cugraph/issues/1043 is fixed. - - Args: - G (cugraph network) - Network to be analysed - - Returns: - triangle_count (int) - Count of triangles in graph - """ - nlen = G.number_of_vertices() - df = G.view_edge_list() - A = cp.full((nlen, nlen), 0, dtype = cp.int32) - A[df.src.values, df.dst.values] = 1 - A = cp.maximum( A, A.transpose() ) - triangle_count = int(cp.around(cp.trace(cp.matmul(A, cp.matmul(A, A)))/6,0)) - return triangle_count - def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sample_default, use_gpu = False): """Provides summary values about the network @@ -1150,11 +1129,7 @@ def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sa component_nums = component_assignments['labels'].unique().astype(int) components = len(component_nums) density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) - # consistent with graph-tool for small graphs - triangle counts differ for large graphs - # could reflect issue https://github.com/rapidsai/cugraph/issues/1043 - # this command can be restored once the above issue is fixed - scheduled for cugraph 0.20 -# triangle_count = cugraph.community.triangle_count.triangles(G)/3 - triangle_count = 3*get_cugraph_triangles(G) + triangle_count = cugraph.community.triangle_count.triangles(G)/3 degree_df = G.in_degree() # consistent with graph-tool triad_count = 0.5 * sum([d * (d - 1) for d in degree_df[degree_df['degree'] > 1]['degree'].to_pandas()]) From d25cd5bced12815cef2f240566b6cd1573e8540e Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 1 Feb 2023 13:55:40 +0000 Subject: [PATCH 09/37] Enable subsampling for graph analysis --- PopPUNK/__main__.py | 9 ++++++- PopPUNK/models.py | 6 ++++- PopPUNK/network.py | 64 ++++++++++++++++++++++++++++++++++++++------- PopPUNK/refine.py | 24 ++++++++++++++--- 4 files changed, 88 insertions(+), 15 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 61dd4b47..65f01eb2 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -121,7 +121,7 @@ def get_options(): modelGroup.add_argument('--threshold', help='Cutoff if using --fit-model threshold', type=float) # model refinement - refinementGroup = parser.add_argument_group('Refine model options') + refinementGroup = parser.add_argument_group('Network analysis and model refinement options') refinementGroup.add_argument('--pos-shift', help='Maximum amount to move the boundary right past between-strain mean', type=float, default = 0) refinementGroup.add_argument('--neg-shift', help='Maximum amount to move the boundary left past within-strain mean]', @@ -133,6 +133,9 @@ def get_options(): refinementGroup.add_argument('--score-idx', help='Index of score to use [default = 0]', type=int, default = 0, choices=[0, 1, 2]) + refinementGroup.add_argument('--summary-sample', + help='Number of sequences used to estimate graph properties [default = all]', + type=int, default = None) refinementGroup.add_argument('--betweenness-sample', help='Number of sequences used to estimate betweeness with a GPU [default = 100]', type = int, default = betweenness_sample_default) @@ -518,6 +521,7 @@ def main(): args.score_idx, args.no_local, args.betweenness_sample, + args.summary_sample, args.gpu_graph) model = new_model elif args.fit_model == "threshold": @@ -581,6 +585,7 @@ def main(): model.within_label, distMat = distMat, weights_type = weights_type, + sample_size = args.summary_sample, betweenness_sample = args.betweenness_sample, use_gpu = args.gpu_graph) else: @@ -596,6 +601,7 @@ def main(): refList, assignments[rank], weights = weights, + sample_size = args.summary_sample, betweenness_sample = args.betweenness_sample, use_gpu = args.gpu_graph, summarise = False @@ -653,6 +659,7 @@ def main(): queryList, indivAssignments, model.within_label, + sample_size = args.summary_sample, betweenness_sample = args.betweenness_sample, use_gpu = args.gpu_graph) isolateClustering[dist_type] = \ diff --git a/PopPUNK/models.py b/PopPUNK/models.py index d51f83f4..0b851c9b 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -702,7 +702,7 @@ def __init__(self, outPrefix): def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indiv_refine = False, unconstrained = False, multi_boundary = 0, score_idx = 0, no_local = False, - betweenness_sample = betweenness_sample_default, use_gpu = False): + sample_size = None, betweenness_sample = betweenness_sample_default, use_gpu = False): '''Extends :func:`~ClusterFit.fit` Fits the distances by optimising network score, by calling @@ -744,6 +744,8 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi betweenness_sample (int) Number of sequences per component used to estimate betweenness using a GPU. Smaller numbers are faster but less precise [default = 100] + sample_size (int) + Number of nodes to subsample for graph statistic calculation use_gpu (bool) Whether to use cugraph for graph analyses @@ -791,6 +793,7 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi no_local = no_local, num_processes = self.threads, betweenness_sample = betweenness_sample, + sample_size = sample_size, use_gpu = use_gpu) self.fitted = True @@ -830,6 +833,7 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi no_local = no_local, num_processes = self.threads, betweenness_sample = betweenness_sample, + sample_size = sample_size, use_gpu = use_gpu) if dist_type == "core": self.core_boundary = core_boundary diff --git a/PopPUNK/network.py b/PopPUNK/network.py index cf40e0f9..d9fdef3c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -548,12 +548,14 @@ def network_to_edges(prev_G_fn, rlist, adding_qq_dists = False, else: return source_ids, target_ids -def print_network_summary(G, betweenness_sample = betweenness_sample_default, use_gpu = False): +def print_network_summary(G, sample_size = None, betweenness_sample = betweenness_sample_default, use_gpu = False): """Wrapper function for printing network information Args: G (graph) List of reference sequence labels + sample_size (int) + Number of nodes to subsample for graph statistic calculation betweenness_sample (int) Number of sequences per component used to estimate betweenness using a GPU. Smaller numbers are faster but less precise [default = 100] @@ -561,7 +563,10 @@ def print_network_summary(G, betweenness_sample = betweenness_sample_default, us Whether to use GPUs for network construction """ # print some summaries - (metrics, scores) = networkSummary(G, betweenness_sample = betweenness_sample, use_gpu = use_gpu) + (metrics, scores) = networkSummary(G, + subsample = sample_size, + betweenness_sample = betweenness_sample, + use_gpu = use_gpu) sys.stderr.write("Network summary:\n" + "\n".join(["\tComponents\t\t\t\t" + str(metrics[0]), "\tDensity\t\t\t\t\t" + "{:.4f}".format(metrics[1]), "\tTransitivity\t\t\t\t" + "{:.4f}".format(metrics[2]), @@ -672,6 +677,7 @@ def construct_network_from_edge_list(rlist, previous_pkl = None, betweenness_sample = betweenness_sample_default, summarise = True, + sample_size = None, use_gpu = False): """Construct an undirected network using a data frame of edges. Nodes are samples and edges where samples are within the same cluster @@ -706,6 +712,8 @@ def construct_network_from_edge_list(rlist, summarise (bool) Whether to calculate and print network summaries with :func:`~networkSummary` (default = True) + sample_size (int) + Number of nodes to subsample for graph statistic calculation use_gpu (bool) Whether to use GPUs for network construction @@ -780,7 +788,10 @@ def construct_network_from_edge_list(rlist, else: G.add_edge_list(edge_list) if summarise: - print_network_summary(G, betweenness_sample = betweenness_sample, use_gpu = use_gpu) + print_network_summary(G, + sample_size = sample_size, + betweenness_sample = betweenness_sample, + use_gpu = use_gpu) return G @@ -795,6 +806,7 @@ def construct_network_from_df(rlist, previous_pkl = None, betweenness_sample = betweenness_sample_default, summarise = True, + sample_size = None, use_gpu = False): """Construct an undirected network using a data frame of edges. Nodes are samples and edges where samples are within the same cluster @@ -829,6 +841,8 @@ def construct_network_from_df(rlist, summarise (bool) Whether to calculate and print network summaries with :func:`~networkSummary` (default = True) + sample_size (int) + Number of nodes to subsample for graph statistic calculation use_gpu (bool) Whether to use GPUs for network construction @@ -894,7 +908,10 @@ def construct_network_from_df(rlist, summarise = False, use_gpu = use_gpu) if summarise: - print_network_summary(G, betweenness_sample = betweenness_sample, use_gpu = use_gpu) + print_network_summary(G, + sample_size = sample_size, + betweenness_sample = betweenness_sample, + use_gpu = use_gpu) return G def construct_network_from_sparse_matrix(rlist, @@ -905,6 +922,7 @@ def construct_network_from_sparse_matrix(rlist, previous_pkl = None, betweenness_sample = betweenness_sample_default, summarise = True, + sample_size = None, use_gpu = False): """Construct an undirected network using a sparse matrix. Nodes are samples and edges where samples are within the same cluster @@ -933,6 +951,8 @@ def construct_network_from_sparse_matrix(rlist, summarise (bool) Whether to calculate and print network summaries with :func:`~networkSummary` (default = True) + sample_size (int) + Number of nodes to subsample for graph statistic calculation use_gpu (bool) Whether to use GPUs for network construction @@ -956,7 +976,10 @@ def construct_network_from_sparse_matrix(rlist, summarise = False, use_gpu = use_gpu) if summarise: - print_network_summary(G, betweenness_sample = betweenness_sample, use_gpu = use_gpu) + print_network_summary(G, + sample_size = sample_size, + betweenness_sample = betweenness_sample, + use_gpu = use_gpu) return G def construct_dense_weighted_network(rlist, distMat, weights_type = None, use_gpu = False): @@ -1024,7 +1047,7 @@ def construct_dense_weighted_network(rlist, distMat, weights_type = None, use_gp def construct_network_from_assignments(rlist, qlist, assignments, within_label = 1, int_offset = 0, weights = None, distMat = None, weights_type = None, previous_network = None, old_ids = None, adding_qq_dists = False, previous_pkl = None, betweenness_sample = betweenness_sample_default, - summarise = True, use_gpu = False): + summarise = True, sample_size = None, use_gpu = False): """Construct an undirected network using sequence lists, assignments of pairwise distances to clusters, and the identifier of the cluster assigned to within-strain distances. Nodes are samples and edges where samples are within the same cluster @@ -1066,6 +1089,8 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label = summarise (bool) Whether to calculate and print network summaries with :func:`~networkSummary` (default = True) + sample_size (int) + Number of nodes to subsample for graph statistic calculation use_gpu (bool) Whether to use GPUs for network construction @@ -1101,17 +1126,22 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label = summarise = False, use_gpu = use_gpu) if summarise: - print_network_summary(G, betweenness_sample = betweenness_sample, use_gpu = use_gpu) + print_network_summary(G, + sample_size = sample_size, + betweenness_sample = betweenness_sample, + use_gpu = use_gpu) return G -def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sample_default, - use_gpu = False): +def networkSummary(G, subsample = None, calc_betweenness=True, + betweenness_sample = betweenness_sample_default, use_gpu = False): """Provides summary values about the network Args: G (graph) The network of strains + subsample (int) + Number of vertices to randomly subsample from graph calc_betweenness (bool) Whether to calculate betweenness stats use_gpu (bool) @@ -1125,6 +1155,13 @@ def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sa List of scores """ if use_gpu: + if subsample is None: + S = G + else: + vertex_subsample = cp.random.choice(cp.arange(0,G.number_of_vertices() - 1), + size = subsample, + replace = False) + S = cugraph.subgraph(G, vertex_subsample) component_assignments = cugraph.components.connectivity.connected_components(G) component_nums = component_assignments['labels'].unique().astype(int) components = len(component_nums) @@ -1138,6 +1175,15 @@ def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sa else: transitivity = 0.0 else: + if subsample is None: + S = G + else: + vfilt = g.new_vertex_property('bool', val = False) + vertex_subsample = np.random.choice(np.arange(0,len(list(G.vertices())) - 1), + size = subsample, + replace = False) + vfilt[vertex_subsample] = True + S = gt.GraphView(G, vfilt=vfilt) component_assignments, component_frequencies = gt.label_components(G) components = len(component_frequencies) density = len(list(G.edges()))/(0.5 * len(list(G.vertices())) * (len(list(G.vertices())) - 1)) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 3239bb0a..38c67f7f 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -88,6 +88,8 @@ def refineFit(distMat, sample_names, mean0, mean1, scale, betweenness_sample (int) Number of sequences per component used to estimate betweenness using a GPU. Smaller numbers are faster but less precise [default = 100] + sample_size (int) + Number of nodes to subsample for graph statistic calculation use_gpu (bool) Whether to use cugraph for graph analyses @@ -134,6 +136,7 @@ def refineFit(distMat, sample_names, mean0, mean1, scale, y_range = y_max, score_idx = score_idx, betweenness_sample = betweenness_sample, + sample_size = sample_size, use_gpu = True), range(global_grid_resolution)) else: @@ -154,6 +157,7 @@ def refineFit(distMat, sample_names, mean0, mean1, scale, y_range = y_max, score_idx = score_idx, betweenness_sample = betweenness_sample, + sample_size = sample_size, use_gpu = False), range(global_grid_resolution)) @@ -202,6 +206,7 @@ def refineFit(distMat, sample_names, mean0, mean1, scale, s_range, score_idx, betweenness_sample = betweenness_sample, + sample_size = sample_size, use_gpu = use_gpu)) global_s[np.isnan(global_s)] = 1 min_idx = np.argmin(np.array(global_s)) @@ -221,7 +226,7 @@ def refineFit(distMat, sample_names, mean0, mean1, scale, method = 'Bounded', options={'disp': True}, args = (sample_names, distMat, mean0, mean1, gradient, slope, score_idx, num_processes, - betweenness_sample, use_gpu) + betweenness_sample, sample_size, use_gpu) ) optimised_s = local_s.x @@ -295,6 +300,7 @@ def multi_refine(distMat, sample_names, mean0, mean1, scale, s_max, s_range, 0, write_clusters = output_prefix, + sample_size = sample_size, use_gpu = use_gpu) def check_search_range(scale, mean0, mean1, lower_s, upper_s): @@ -360,7 +366,7 @@ def expand_cugraph_network(G, G_extra_df): def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx = 0, thread_idx = 0, betweenness_sample = betweenness_sample_default, - write_clusters = None, use_gpu = False): + write_clusters = None, sample_size = None, use_gpu = False): """Construct a network, then add edges to it iteratively. Input is from ``pp_sketchlib.iterateBoundary1D`` or``pp_sketchlib.iterateBoundary2D`` @@ -387,6 +393,8 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx = 0, write_clusters (str) Set to a prefix to write the clusters from each position to files [default = None] + sample_size (int) + Number of nodes to subsample for graph statistic calculation use_gpu (bool) Whether to use cugraph for graph analyses @@ -435,6 +443,7 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx = 0, # Add score into vector for any offsets passed (should usually just be one) G_summary = networkSummary(G, score_idx > 0, + subsample = sample_size, betweenness_sample = betweenness_sample, use_gpu = use_gpu) latest_score = -G_summary[1][score_idx] @@ -458,7 +467,7 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx = 0, def newNetwork(s, sample_names, distMat, mean0, mean1, gradient, slope=2, score_idx=0, cpus=1, betweenness_sample = betweenness_sample_default, - use_gpu = False): + sample_size = None, use_gpu = False): """Wrapper function for :func:`~PopPUNK.network.construct_network_from_edge_list` which is called by optimisation functions moving a triangular decision boundary. @@ -490,6 +499,8 @@ def newNetwork(s, sample_names, distMat, mean0, mean1, gradient, betweenness_sample (int) Number of sequences per component used to estimate betweenness using a GPU. Smaller numbers are faster but less precise [default = 100] + sample_size (int) + Number of nodes to subsample for graph statistic calculation use_gpu (bool) Whether to use cugraph for graph analysis @@ -523,12 +534,14 @@ def newNetwork(s, sample_names, distMat, mean0, mean1, gradient, # Return score score = networkSummary(G, score_idx > 0, + subsample = sample_size, betweenness_sample = betweenness_sample, use_gpu = use_gpu)[1][score_idx] return(-score) def newNetwork2D(y_idx, sample_names, distMat, x_range, y_range, score_idx=0, - betweenness_sample = betweenness_sample_default, use_gpu = False): + betweenness_sample = betweenness_sample_default, sample_size = None, + use_gpu = False): """Wrapper function for thresholdIterate2D and :func:`growNetwork`. For a given y_max, constructs networks across x_range and returns a list @@ -551,6 +564,8 @@ def newNetwork2D(y_idx, sample_names, distMat, x_range, y_range, score_idx=0, betweenness_sample (int) Number of sequences per component used to estimate betweenness using a GPU. Smaller numbers are faster but less precise [default = 100] + sample_size (int) + Number of nodes to subsample for graph statistic calculation use_gpu (bool) Whether to use cugraph for graph analysis @@ -581,6 +596,7 @@ def newNetwork2D(y_idx, sample_names, distMat, x_range, y_range, score_idx=0, score_idx, y_idx, betweenness_sample, + sample_size = sample_size, use_gpu = use_gpu) return(scores) From 817f7ab01b270e371fc54c25ddb098d4da082623 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 1 Feb 2023 13:59:57 +0000 Subject: [PATCH 10/37] Update triangle counting --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index d9fdef3c..2ce369d9 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1166,7 +1166,7 @@ def networkSummary(G, subsample = None, calc_betweenness=True, component_nums = component_assignments['labels'].unique().astype(int) components = len(component_nums) density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) - triangle_count = cugraph.community.triangle_count.triangles(G)/3 + triangle_count = cugraph.triangle_count(G)/3 degree_df = G.in_degree() # consistent with graph-tool triad_count = 0.5 * sum([d * (d - 1) for d in degree_df[degree_df['degree'] > 1]['degree'].to_pandas()]) From 3e583862ee455bee9c360867e8e1bf3f1895ab26 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 1 Feb 2023 14:48:13 +0000 Subject: [PATCH 11/37] Fix argument parsing --- PopPUNK/models.py | 2 +- PopPUNK/network.py | 13 ++++++++----- PopPUNK/refine.py | 5 +++-- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 0b851c9b..1309b11c 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -702,7 +702,7 @@ def __init__(self, outPrefix): def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indiv_refine = False, unconstrained = False, multi_boundary = 0, score_idx = 0, no_local = False, - sample_size = None, betweenness_sample = betweenness_sample_default, use_gpu = False): + betweenness_sample = betweenness_sample_default, sample_size = None, use_gpu = False): '''Extends :func:`~ClusterFit.fit` Fits the distances by optimising network score, by calling diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 2ce369d9..cbc26214 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1133,17 +1133,20 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label = return G -def networkSummary(G, subsample = None, calc_betweenness=True, - betweenness_sample = betweenness_sample_default, use_gpu = False): +def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sample_default, + subsample = None, use_gpu = False): """Provides summary values about the network Args: G (graph) The network of strains - subsample (int) - Number of vertices to randomly subsample from graph calc_betweenness (bool) Whether to calculate betweenness stats + betweenness_sample (int) + Number of sequences per component used to estimate betweenness using + a GPU. Smaller numbers are faster but less precise [default = 100] + subsample (int) + Number of vertices to randomly subsample from graph use_gpu (bool) Whether to use cugraph for graph analysis @@ -1178,7 +1181,7 @@ def networkSummary(G, subsample = None, calc_betweenness=True, if subsample is None: S = G else: - vfilt = g.new_vertex_property('bool', val = False) + vfilt = G.new_vertex_property('bool', val = False) vertex_subsample = np.random.choice(np.arange(0,len(list(G.vertices())) - 1), size = subsample, replace = False) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 38c67f7f..350553ef 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -51,7 +51,8 @@ def refineFit(distMat, sample_names, mean0, mean1, scale, max_move, min_move, slope = 2, score_idx = 0, unconstrained = False, no_local = False, num_processes = 1, - betweenness_sample = betweenness_sample_default, use_gpu = False): + betweenness_sample = betweenness_sample_default, sample_size = None, + use_gpu = False): """Try to refine a fit by maximising a network score based on transitivity and density. Iteratively move the decision boundary to do this, using starting point from existing model. @@ -443,8 +444,8 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx = 0, # Add score into vector for any offsets passed (should usually just be one) G_summary = networkSummary(G, score_idx > 0, - subsample = sample_size, betweenness_sample = betweenness_sample, + subsample = sample_size, use_gpu = use_gpu) latest_score = -G_summary[1][score_idx] for s in range(prev_idx, idx): From 17e3af7f8ee76fbbeffdd7416cd80819e9838b74 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 1 Feb 2023 15:00:06 +0000 Subject: [PATCH 12/37] Use subgraph for statistics --- PopPUNK/network.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index cbc26214..1b4e937b 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1165,12 +1165,12 @@ def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sa size = subsample, replace = False) S = cugraph.subgraph(G, vertex_subsample) - component_assignments = cugraph.components.connectivity.connected_components(G) + component_assignments = cugraph.components.connectivity.connected_components(S) component_nums = component_assignments['labels'].unique().astype(int) components = len(component_nums) - density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) - triangle_count = cugraph.triangle_count(G)/3 - degree_df = G.in_degree() + density = S.number_of_edges()/(0.5 * S.number_of_vertices() * S.number_of_vertices() - 1) + triangle_count = cugraph.triangle_count(S)/3 + degree_df = S.in_degree() # consistent with graph-tool triad_count = 0.5 * sum([d * (d - 1) for d in degree_df[degree_df['degree'] > 1]['degree'].to_pandas()]) if triad_count > 0: @@ -1187,10 +1187,10 @@ def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sa replace = False) vfilt[vertex_subsample] = True S = gt.GraphView(G, vfilt=vfilt) - component_assignments, component_frequencies = gt.label_components(G) + component_assignments, component_frequencies = gt.label_components(S) components = len(component_frequencies) - density = len(list(G.edges()))/(0.5 * len(list(G.vertices())) * (len(list(G.vertices())) - 1)) - transitivity = gt.global_clustering(G)[0] + density = len(list(S.edges()))/(0.5 * len(list(S.vertices())) * (len(list(S.vertices())) - 1)) + transitivity = gt.global_clustering(S)[0] mean_bt = 0 weighted_mean_bt = 0 @@ -1204,7 +1204,7 @@ def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sa size = component_frequencies[component_frequencies.index == component].iloc[0].astype(int) if size > 3: component_vertices = component_assignments['vertex'][component_assignments['labels']==component] - subgraph = cugraph.subgraph(G, component_vertices) + subgraph = cugraph.subgraph(S, component_vertices) if len(component_vertices) >= betweenness_sample: component_betweenness = cugraph.betweenness_centrality(subgraph, k = betweenness_sample, @@ -1218,7 +1218,7 @@ def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sa for component, size in enumerate(component_frequencies): if size > 3: vfilt = component_assignments.a == component - subgraph = gt.GraphView(G, vfilt=vfilt) + subgraph = gt.GraphView(S, vfilt=vfilt) betweenness.append(max(vertex_betweenness(subgraph, norm=True))) sizes.append(size) From ae4958305cf97de0b8a8de1d64fce8041846318a Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 1 Feb 2023 15:04:23 +0000 Subject: [PATCH 13/37] Update network count --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 1b4e937b..6ee8d5ad 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1169,7 +1169,7 @@ def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sa component_nums = component_assignments['labels'].unique().astype(int) components = len(component_nums) density = S.number_of_edges()/(0.5 * S.number_of_vertices() * S.number_of_vertices() - 1) - triangle_count = cugraph.triangle_count(S)/3 + triangle_count = cugraph.triangle_count(S)['counts'].sum()/3 degree_df = S.in_degree() # consistent with graph-tool triad_count = 0.5 * sum([d * (d - 1) for d in degree_df[degree_df['degree'] > 1]['degree'].to_pandas()]) From 934f8f5eb0369b84490bd997bb248ce69281c888 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 1 Feb 2023 15:06:10 +0000 Subject: [PATCH 14/37] Remove correction factor --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 6ee8d5ad..ee679215 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1169,7 +1169,7 @@ def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sa component_nums = component_assignments['labels'].unique().astype(int) components = len(component_nums) density = S.number_of_edges()/(0.5 * S.number_of_vertices() * S.number_of_vertices() - 1) - triangle_count = cugraph.triangle_count(S)['counts'].sum()/3 + triangle_count = cugraph.triangle_count(S)['counts'].sum() degree_df = S.in_degree() # consistent with graph-tool triad_count = 0.5 * sum([d * (d - 1) for d in degree_df[degree_df['degree'] > 1]['degree'].to_pandas()]) From 5c29da10cf12d84097c75d40d96e71cf1976a6c1 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 1 Feb 2023 15:26:35 +0000 Subject: [PATCH 15/37] Fix subsampling for graph-tool --- PopPUNK/network.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ee679215..d01494ec 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1181,11 +1181,12 @@ def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sa if subsample is None: S = G else: - vfilt = G.new_vertex_property('bool', val = False) vertex_subsample = np.random.choice(np.arange(0,len(list(G.vertices())) - 1), size = subsample, replace = False) - vfilt[vertex_subsample] = True + vfilt_bool = np.full(len(list(G.vertices())) - 1, False) + vfilt_bool[vertex_subsample] = True + vfilt = G.new_vertex_property('bool', vals = vfilt_bool) S = gt.GraphView(G, vfilt=vfilt) component_assignments, component_frequencies = gt.label_components(S) components = len(component_frequencies) From 16c274a14eded49bec76f4ac6af216020f9b8570 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 1 Feb 2023 15:29:50 +0000 Subject: [PATCH 16/37] Add tests for subsampling --- test/run_test.py | 1 + test/test-gpu.py | 1 + 2 files changed, 2 insertions(+) diff --git a/test/run_test.py b/test/run_test.py index 5f26a2f8..68f3d01f 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -43,6 +43,7 @@ subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.15 --overwrite --score-idx 1", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.15 --overwrite --score-idx 2", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model threshold --threshold 0.003 --ref-db example_db --output example_threshold", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.15 --summary-sample 15 --overwrite", shell=True, check=True) sys.stderr.write("Running multi boundary refinement (--multi-boundary and poppunk_iterate.py)\n") subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_iterate --neg-shift -0.2 --overwrite --multi-boundary 10", shell=True, check=True) diff --git a/test/test-gpu.py b/test/test-gpu.py index 58af75c2..32fa4d6e 100755 --- a/test/test-gpu.py +++ b/test/test-gpu.py @@ -43,6 +43,7 @@ subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.2 --overwrite --score-idx 1 --gpu-graph", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.2 --overwrite --score-idx 2 --gpu-graph", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model threshold --threshold 0.003 --ref-db example_db --output example_threshold --gpu-graph", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.15 --summary-sample 15 --overwrite --gpu-graph", shell=True, check=True) # lineage clustering sys.stderr.write("Running lineage clustering test (--fit-model lineage)\n") From 75ef0d52f3ba456f55ab1e120dc17de9707a606f Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 1 Feb 2023 15:41:56 +0000 Subject: [PATCH 17/37] Enable sampling for multirefine --- PopPUNK/models.py | 2 ++ PopPUNK/refine.py | 11 +++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 1309b11c..62c9620b 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -809,6 +809,8 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi multi_boundary, self.outPrefix, num_processes = self.threads, + betweenness_sample = betweenness_sample, + sample_size = sample_size, use_gpu = use_gpu) # Try and do a 1D refinement for both core and accessory diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 350553ef..aa0f9a00 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -247,8 +247,9 @@ def refineFit(distMat, sample_names, mean0, mean1, scale, return optimal_x, optimal_y, optimised_s def multi_refine(distMat, sample_names, mean0, mean1, scale, s_max, - n_boundary_points, output_prefix, - num_processes = 1, use_gpu = False): + n_boundary_points, output_prefix, num_processes = 1, + betweenness_sample = betweenness_sample_default, sample_size = None, + use_gpu = False): """Move the refinement boundary between the optimum and where it meets an axis. Discrete steps, output the clusers at each step @@ -270,6 +271,11 @@ def multi_refine(distMat, sample_names, mean0, mean1, scale, s_max, num_processes (int) Number of threads to use in the global optimisation step. (default = 1) + betweenness_sample (int) + Number of sequences per component used to estimate betweenness using + a GPU. Smaller numbers are faster but less precise [default = 100] + sample_size (int) + Number of nodes to subsample for graph statistic calculation use_gpu (bool) Whether to use cugraph for graph analyses """ @@ -301,6 +307,7 @@ def multi_refine(distMat, sample_names, mean0, mean1, scale, s_max, s_range, 0, write_clusters = output_prefix, + betweenness_sample = betweenness_sample, sample_size = sample_size, use_gpu = use_gpu) From bde0a3ec8981ccaf5e2097fcf35053978313ff2f Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 1 Feb 2023 16:17:29 +0000 Subject: [PATCH 18/37] Enable extraction of full graph statistics --- PopPUNK/info.py | 55 +++++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/PopPUNK/info.py b/PopPUNK/info.py index 3e0607c7..8baf9599 100644 --- a/PopPUNK/info.py +++ b/PopPUNK/info.py @@ -52,9 +52,13 @@ def get_options(): # main code def main(): + # Import value + from .__main__ import betweenness_sample_default + # Import functions from .network import load_network_file from .network import sparse_mat_to_network + from .network import print_network_summary from .utils import check_and_set_gpu from .utils import setGtThreads @@ -103,6 +107,32 @@ def main(): use_rc = ref_db['sketches'].attrs['use_rc'] == 1 print("Uses canonical k-mers:\t" + str(use_rc)) + # Select network file name + if args.network_file is None: + if use_gpu: + network_file = os.path.join(args.db, os.path.basename(args.db) + '_graph.csv.gz') + else: + network_file = os.path.join(args.db, os.path.basename(args.db) + '_graph.gt') + else: + network_file = args.network_file + + # Open network file + if network_file.endswith('.gt'): + G = load_network_file(network_file, use_gpu = False) + elif network_file.endswith('.csv.gz'): + if use_gpu: + G = load_network_file(network_file, use_gpu = True) + else: + sys.stderr.write('Unable to load necessary GPU libraries\n') + sys.exit(1) + elif network_file.endswith('.npz'): + sparse_mat = sparse.load_npz(network_file) + G = sparse_mat_to_network(sparse_mat, sample_names, use_gpu = use_gpu) + else: + sys.stderr.write('Unrecognised suffix: expected ".gt", ".csv.gz" or ".npz"\n') + sys.exit(1) + print_network_summary(G, betweenness_sample = betweenness_sample_default, use_gpu = args.use_gpu) + # Print sample information if not args.simple: sample_names = list(ref_db['sketches'].keys()) @@ -115,31 +145,6 @@ def main(): sample_sequence_length[sample_name] = ref_db['sketches/' + sample_name].attrs['length'] sample_missing_bases[sample_name] = ref_db['sketches/' + sample_name].attrs['missing_bases'] - # Select network file name - if args.network_file is None: - if use_gpu: - network_file = os.path.join(args.db, os.path.basename(args.db) + '_graph.csv.gz') - else: - network_file = os.path.join(args.db, os.path.basename(args.db) + '_graph.gt') - else: - network_file = args.network_file - - # Open network file - if network_file.endswith('.gt'): - G = load_network_file(network_file, use_gpu = False) - elif network_file.endswith('.csv.gz'): - if use_gpu: - G = load_network_file(network_file, use_gpu = True) - else: - sys.stderr.write('Unable to load necessary GPU libraries\n') - sys.exit(1) - elif network_file.endswith('.npz'): - sparse_mat = sparse.load_npz(network_file) - G = sparse_mat_to_network(sparse_mat, sample_names, use_gpu = use_gpu) - else: - sys.stderr.write('Unrecognised suffix: expected ".gt", ".csv.gz" or ".npz"\n') - sys.exit(1) - # Analyse network if use_gpu: component_assignments_df = cugraph.components.connectivity.connected_components(G) From 757eeffeecfd51f8f792989e51fd024ea080852b Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 2 Feb 2023 06:48:02 +0000 Subject: [PATCH 19/37] Update function docstring --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index d01494ec..23b8ab1f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -689,8 +689,8 @@ def construct_network_from_edge_list(rlist, List of reference sequence labels qlist (list) List of query sequence labels - G_df (cudf or pandas data frame) - Data frame in which the first two columns are the nodes linked by edges + edge_list (list of tuples) + List of connections in the network weights (list) List of edge weights distMat (2 column ndarray) From 7311f71611493ed1fd6fda5544dbd0347eddcbc1 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 3 Feb 2023 14:49:15 +0000 Subject: [PATCH 20/37] Allow for tmp file space --- PopPUNK/trees.py | 9 +++++++-- PopPUNK/visualise.py | 8 ++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/PopPUNK/trees.py b/PopPUNK/trees.py index 1891c0d0..a2e3bf72 100644 --- a/PopPUNK/trees.py +++ b/PopPUNK/trees.py @@ -147,7 +147,7 @@ def load_tree(prefix, type, distances = 'core'): return tree_string -def generate_nj_tree(coreMat, seqLabels, outPrefix, rapidnj, threads): +def generate_nj_tree(coreMat, seqLabels, outPrefix, tmp = None, rapidnj = None, threads = 1): """Generate phylogeny using dendropy or RapidNJ Writes a neighbour joining tree (.nwk) from core distances. @@ -159,6 +159,8 @@ def generate_nj_tree(coreMat, seqLabels, outPrefix, rapidnj, threads): Processed names of sequences being analysed. outPrefix (str) Output prefix for core distances file + tmp (str) + Directory in which to create large temporary pairwise distance file rapidnj (str) A string with the location of the rapidnj executable for tree-building. If None, will use dendropy by default @@ -174,7 +176,10 @@ def generate_nj_tree(coreMat, seqLabels, outPrefix, rapidnj, threads): # calculate phylogeny sys.stderr.write("Building phylogeny\n") if rapidnj is not None: - core_dist_file = outPrefix + "/" + os.path.basename(outPrefix) + "_core_dists.csv" + if tmp is None: + core_dist_file = outPrefix + "/" + os.path.basename(outPrefix) + "_core_dists.csv" + else: + core_dist_file = tmp + "/" + os.path.basename(outPrefix) + "_core_dists.csv" np.savetxt(core_dist_file, coreMat, delimiter=",", header = ",".join(seqLabels), comments="") tree = buildRapidNJ(rapidnj, seqLabels, coreMat, outPrefix, threads = threads) os.remove(core_dist_file) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 4669da1b..069f55c5 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -134,6 +134,7 @@ def get_options(): other.add_argument('--gpu-dist', default=False, action='store_true', help='Use a GPU when calculating distances [default = False]') other.add_argument('--gpu-graph', default=False, action='store_true', help='Use a GPU when calculating graphs [default = False]') other.add_argument('--deviceid', default=0, type=int, help='CUDA device ID, if using GPU [default = 0]') + other.add_argument('--tmp', default=None, type=str, help='Directory for large temporary files') other.add_argument('--strand-preserved', default=False, action='store_true', help='If distances being calculated, treat strand as known when calculating random ' 'match chances [default = False]') @@ -184,7 +185,8 @@ def generate_visualisations(query_db, tree, mst_distances, overwrite, - display_cluster): + display_cluster, + tmp): from .models import loadClusterFit @@ -529,6 +531,7 @@ def generate_visualisations(query_db, nj_tree = generate_nj_tree(core_distMat, combined_seq, output, + tmp, rapidnj, threads = threads) else: @@ -650,7 +653,8 @@ def main(): args.tree, args.mst_distances, args.overwrite, - args.display_cluster) + args.display_cluster, + args.tmp) if __name__ == '__main__': main() From 9e010692fc6586d59c10f111d694b0a08186f1bb Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 3 Feb 2023 16:33:44 +0000 Subject: [PATCH 21/37] Reduce precision and size of distance matrix --- PopPUNK/trees.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/PopPUNK/trees.py b/PopPUNK/trees.py index a2e3bf72..66b3374e 100644 --- a/PopPUNK/trees.py +++ b/PopPUNK/trees.py @@ -180,7 +180,13 @@ def generate_nj_tree(coreMat, seqLabels, outPrefix, tmp = None, rapidnj = None, core_dist_file = outPrefix + "/" + os.path.basename(outPrefix) + "_core_dists.csv" else: core_dist_file = tmp + "/" + os.path.basename(outPrefix) + "_core_dists.csv" - np.savetxt(core_dist_file, coreMat, delimiter=",", header = ",".join(seqLabels), comments="") + np.savetxt(core_dist_file, + coreMat, + fmt='%.4e', + delimiter=",", + header = ",".join(seqLabels), + comments="" + ) tree = buildRapidNJ(rapidnj, seqLabels, coreMat, outPrefix, threads = threads) os.remove(core_dist_file) else: From c36489ab3c73cd8b221691a8955a6ae6b6e4fbd6 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 3 Feb 2023 16:42:35 +0000 Subject: [PATCH 22/37] Make indentation consistent --- PopPUNK/lineages.py | 172 ++++++++++++++++++++++---------------------- 1 file changed, 86 insertions(+), 86 deletions(-) diff --git a/PopPUNK/lineages.py b/PopPUNK/lineages.py index 38f0300e..298ab105 100755 --- a/PopPUNK/lineages.py +++ b/PopPUNK/lineages.py @@ -189,92 +189,92 @@ def create_db(args): lineage_dbs = {} overall_lineage = {} for strain,isolates in strains: - # Make new database directory - sys.stderr.write("Making database for strain " + str(strain) + "\n") - strain_db_name = args.lineage_db_prefix + '_' + str(strain) + '_lineage_db' - isolate_list = isolates[isolates.columns.values[0]].to_list() - num_isolates = len(isolate_list) - if num_isolates >= args.min_count: - lineage_dbs[strain] = strain_db_name - if os.path.isdir(strain_db_name) and args.overwrite: - os.rmdir(strain_db_name) - if not os.path.isdir(strain_db_name): - try: - os.makedirs(strain_db_name) - except OSError: - sys.stderr.write("Cannot create output directory " + strain_db_name + "\n") - sys.exit(1) - # Make link to main database - src_db = os.path.join(args.create_db,os.path.basename(args.create_db) + '.h5') - dest_db = os.path.join(strain_db_name,os.path.basename(strain_db_name) + '.h5') - rel_path = os.path.relpath(src_db, os.path.dirname(dest_db)) - if os.path.exists(dest_db) and args.overwrite: - os.remove(dest_db) - elif not os.path.exists(dest_db): - os.symlink(rel_path,dest_db) - # Extract sparse distances - prune_distance_matrix(rlist, - list(set(rlist) - set(isolate_list)), - X, - os.path.join(strain_db_name,strain_db_name + '.dists')) - # Initialise model - model = LineageFit(strain_db_name, - rank_list, - max_search_depth, - args.reciprocal_only, - args.count_unique_distances, - use_gpu = args.gpu_graph) - model.set_threads(args.threads) - # Load pruned distance matrix - strain_rlist, strain_qlist, strain_self, strain_X = \ - readPickle(os.path.join(strain_db_name,strain_db_name + '.dists'), - enforce_self=False, - distances=True) - # Fit model - model.fit(strain_X, - args.use_accessory) - # Lineage fit requires some iteration - indivNetworks = {} - lineage_clusters = defaultdict(dict) - # Iterate over ranks - for rank in rank_list: - if rank <= num_isolates: - assignments = model.assign(rank) - # Generate networks - indivNetworks[rank] = construct_network_from_edge_list(strain_rlist, - strain_rlist, - assignments, - weights = None, - betweenness_sample = None, - use_gpu = args.gpu_graph, - summarise = False - ) - # Write networks - save_network(indivNetworks[rank], - prefix = strain_db_name, - suffix = '_rank_' + str(rank) + '_graph', - use_gpu = args.gpu_graph) - # Identify clusters from output - lineage_clusters[rank] = \ - printClusters(indivNetworks[rank], - strain_rlist, - printCSV = False, - use_gpu = args.gpu_graph) - n_clusters = max(lineage_clusters[rank].values()) - sys.stderr.write("Network for rank " + str(rank) + " has " + - str(n_clusters) + " lineages\n") - # For each strain, print output of each rank as CSV - overall_lineage[strain] = createOverallLineage(rank_list, lineage_clusters) - writeClusterCsv(os.path.join(strain_db_name,os.path.basename(strain_db_name) + '_lineages.csv'), - strain_rlist, - strain_rlist, - overall_lineage[strain], - output_format = 'phandango', - epiCsv = None, - suffix = '_Lineage') - genomeNetwork = indivNetworks[min(rank_list)] - # Save model - model.save() + # Make new database directory + sys.stderr.write("Making database for strain " + str(strain) + "\n") + strain_db_name = args.lineage_db_prefix + '_' + str(strain) + '_lineage_db' + isolate_list = isolates[isolates.columns.values[0]].to_list() + num_isolates = len(isolate_list) + if num_isolates >= args.min_count: + lineage_dbs[strain] = strain_db_name + if os.path.isdir(strain_db_name) and args.overwrite: + os.rmdir(strain_db_name) + if not os.path.isdir(strain_db_name): + try: + os.makedirs(strain_db_name) + except OSError: + sys.stderr.write("Cannot create output directory " + strain_db_name + "\n") + sys.exit(1) + # Make link to main database + src_db = os.path.join(args.create_db,os.path.basename(args.create_db) + '.h5') + dest_db = os.path.join(strain_db_name,os.path.basename(strain_db_name) + '.h5') + rel_path = os.path.relpath(src_db, os.path.dirname(dest_db)) + if os.path.exists(dest_db) and args.overwrite: + os.remove(dest_db) + elif not os.path.exists(dest_db): + os.symlink(rel_path,dest_db) + # Extract sparse distances + prune_distance_matrix(rlist, + list(set(rlist) - set(isolate_list)), + X, + os.path.join(strain_db_name,strain_db_name + '.dists')) + # Initialise model + model = LineageFit(strain_db_name, + rank_list, + max_search_depth, + args.reciprocal_only, + args.count_unique_distances, + use_gpu = args.gpu_graph) + model.set_threads(args.threads) + # Load pruned distance matrix + strain_rlist, strain_qlist, strain_self, strain_X = \ + readPickle(os.path.join(strain_db_name,strain_db_name + '.dists'), + enforce_self=False, + distances=True) + # Fit model + model.fit(strain_X, + args.use_accessory) + # Lineage fit requires some iteration + indivNetworks = {} + lineage_clusters = defaultdict(dict) + # Iterate over ranks + for rank in rank_list: + if rank <= num_isolates: + assignments = model.assign(rank) + # Generate networks + indivNetworks[rank] = construct_network_from_edge_list(strain_rlist, + strain_rlist, + assignments, + weights = None, + betweenness_sample = None, + use_gpu = args.gpu_graph, + summarise = False + ) + # Write networks + save_network(indivNetworks[rank], + prefix = strain_db_name, + suffix = '_rank_' + str(rank) + '_graph', + use_gpu = args.gpu_graph) + # Identify clusters from output + lineage_clusters[rank] = \ + printClusters(indivNetworks[rank], + strain_rlist, + printCSV = False, + use_gpu = args.gpu_graph) + n_clusters = max(lineage_clusters[rank].values()) + sys.stderr.write("Network for rank " + str(rank) + " has " + + str(n_clusters) + " lineages\n") + # For each strain, print output of each rank as CSV + overall_lineage[strain] = createOverallLineage(rank_list, lineage_clusters) + writeClusterCsv(os.path.join(strain_db_name,os.path.basename(strain_db_name) + '_lineages.csv'), + strain_rlist, + strain_rlist, + overall_lineage[strain], + output_format = 'phandango', + epiCsv = None, + suffix = '_Lineage') + genomeNetwork = indivNetworks[min(rank_list)] + # Save model + model.save() # Print combined strain and lineage clustering print_overall_clustering(overall_lineage,args.output + '.csv',rlist) From 628844699fa02a02f46edc41484b9eb47f6a8005 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 3 Feb 2023 17:03:05 +0000 Subject: [PATCH 23/37] Prune rank list for rare strains --- PopPUNK/lineages.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PopPUNK/lineages.py b/PopPUNK/lineages.py index 298ab105..d3ed5b97 100755 --- a/PopPUNK/lineages.py +++ b/PopPUNK/lineages.py @@ -217,6 +217,8 @@ def create_db(args): list(set(rlist) - set(isolate_list)), X, os.path.join(strain_db_name,strain_db_name + '.dists')) + # Prune rank list + pruned_rank_list = [r for r in rank_list if r <= num_isolates] # Initialise model model = LineageFit(strain_db_name, rank_list, From 98e36a8c3f7d38f6abea2e72d2f47482a785b657 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 3 Feb 2023 17:14:40 +0000 Subject: [PATCH 24/37] Remove unnecessary file generation --- PopPUNK/trees.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/PopPUNK/trees.py b/PopPUNK/trees.py index 66b3374e..5d501f4f 100644 --- a/PopPUNK/trees.py +++ b/PopPUNK/trees.py @@ -28,7 +28,7 @@ except ImportError: pass -def buildRapidNJ(rapidnj, refList, coreMat, outPrefix, threads = 1): +def buildRapidNJ(rapidnj, refList, coreMat, outPrefix, tmp = None, threads = 1): """Use rapidNJ for more rapid tree building Creates a phylip of core distances, system call to rapidnj executable, loads tree as @@ -43,6 +43,8 @@ def buildRapidNJ(rapidnj, refList, coreMat, outPrefix, threads = 1): NxN core distance matrix produced in :func:`~outputsForMicroreact` outPrefix (str) Prefix for all generated output files, which will be placed in `outPrefix` subdirectory + tmp (str) + Directory in which to create large temporary pairwise distance file threads (int) Number of threads to use @@ -51,12 +53,15 @@ def buildRapidNJ(rapidnj, refList, coreMat, outPrefix, threads = 1): Newick-formatted NJ tree from core distances """ # generate phylip matrix - phylip_name = outPrefix + "/" + os.path.basename(outPrefix) + "_core_distances.phylip" + if tmp is not None: + phylip_name = tmp + "/" + os.path.basename(outPrefix) + "_core_distances.phylip" + else: + phylip_name = outPrefix + "/" + os.path.basename(outPrefix) + "_core_distances.phylip" with open(phylip_name, 'w') as pFile: pFile.write(str(len(refList))+"\n") for coreDist, ref in zip(coreMat, refList): pFile.write(ref) - pFile.write(' '+' '.join(map(str, coreDist))) + pFile.write(' '+' '.join(map('{:.4f}'.format, coreDist))) pFile.write("\n") # construct tree @@ -176,19 +181,7 @@ def generate_nj_tree(coreMat, seqLabels, outPrefix, tmp = None, rapidnj = None, # calculate phylogeny sys.stderr.write("Building phylogeny\n") if rapidnj is not None: - if tmp is None: - core_dist_file = outPrefix + "/" + os.path.basename(outPrefix) + "_core_dists.csv" - else: - core_dist_file = tmp + "/" + os.path.basename(outPrefix) + "_core_dists.csv" - np.savetxt(core_dist_file, - coreMat, - fmt='%.4e', - delimiter=",", - header = ",".join(seqLabels), - comments="" - ) - tree = buildRapidNJ(rapidnj, seqLabels, coreMat, outPrefix, threads = threads) - os.remove(core_dist_file) + tree = buildRapidNJ(rapidnj, seqLabels, coreMat, outPrefix, tmp = tmp, threads = threads) else: matrix = [] for row, idx in enumerate(coreMat): From 9a19cdc570485664841def74fdc62dccf3876427 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Sat, 4 Feb 2023 06:58:58 +0000 Subject: [PATCH 25/37] Fix pandas error handling --- PopPUNK/plot.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index 0b34243e..a0196abe 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -15,6 +15,7 @@ import itertools # for other outputs import pandas as pd +from pandas.errors import DataError import h5py from collections import defaultdict from sklearn import utils @@ -728,17 +729,19 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, sys.stderr.write("Parsed data, now writing to CSV\n") try: pd.DataFrame(data=d).to_csv(outfile, columns = colnames, index = False) - except subprocess.CalledProcessError as e: - sys.stderr.write("Problem with epidemiological data CSV; returned code: " + str(e.returncode) + "\n") + except (ValueError,DataError) as e: + sys.stderr.write("Problem with epidemiological data CSV; returned code: " + str(e) + "\n") # check CSV prev_col_items = -1 prev_col_name = "unknown" for col in d: this_col_items = len(d[col]) + sys.stderr.write(col + ' has length ' + str(this_col_items) + '\n') if prev_col_items > -1 and prev_col_items != this_col_items: sys.stderr.write("Discrepant length between " + prev_col_name + \ - " (length of " + prev_col_items + ") and " + \ - col + "(length of " + this_col_items + ")\n") + " (length of " + str(prev_col_items) + ") and " + \ + col + "(length of " + str(this_col_items) + ")\n") + prev_col_items = this_col_items sys.exit(1) def outputsForMicroreact(combined_list, clustering, nj_tree, mst_tree, accMat, perplexity, From df9117f16030480a0f4aecf2e808218ded0a1da3 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Sun, 5 Feb 2023 07:12:43 +0000 Subject: [PATCH 26/37] Filter duplicate rows in epi csv --- PopPUNK/plot.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index a0196abe..d9dad098 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -713,13 +713,15 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, d['Status'].append("Reference") if epiCsv is not None: if label in epiData.index: - for col, value in zip(epiData.columns.values, epiData.loc[label].values): - if col not in columns_to_be_omitted: - d[col].append(str(value)) - else: - for col in epiData.columns.values: - if col not in columns_to_be_omitted: - d[col].append('nan') + info_added.append(label) + if label in epiData.index: + for col, value in zip(epiData.columns.values, epiData.loc[[label]].iloc[0].values): + if col not in columns_to_be_omitted: + d[col].append(str(value)) + else: + for col in epiData.columns.values: + if col not in columns_to_be_omitted: + d[col].append('nan') else: sys.stderr.write("Cannot find " + name + " in clustering\n") From a4ab20183544e55623a5512e139a4a0dc36a547f Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Sun, 5 Feb 2023 12:11:17 +0000 Subject: [PATCH 27/37] Remove obsolete variable --- PopPUNK/plot.py | 1 - 1 file changed, 1 deletion(-) diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index d9dad098..239b8768 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -713,7 +713,6 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, d['Status'].append("Reference") if epiCsv is not None: if label in epiData.index: - info_added.append(label) if label in epiData.index: for col, value in zip(epiData.columns.values, epiData.loc[[label]].iloc[0].values): if col not in columns_to_be_omitted: From aa637d7f6d94dc95774b085484475373241822d2 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Tue, 7 Feb 2023 06:11:44 +0000 Subject: [PATCH 28/37] Harmonise behaviour of BGMM and DBSCAN --- PopPUNK/bgmm.py | 29 +++++++++++++++++++++++++++-- PopPUNK/models.py | 3 ++- PopPUNK/plot.py | 3 ++- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/PopPUNK/bgmm.py b/PopPUNK/bgmm.py index 92a001c6..b8db346d 100644 --- a/PopPUNK/bgmm.py +++ b/PopPUNK/bgmm.py @@ -45,10 +45,36 @@ def fit2dMultiGaussian(X, dpgmm_max_K = 2): return dpgmm +def findBetweenLabel_bgmm(means, assignments, rank = 0): + """Identify between-strain links + + Finds the component with the largest number of points + assigned to it + + Args: + means (numpy.array) + K x 2 array of mixture component means + assignments (numpy.array) + Sample cluster assignments + rank (int) + Which label to find, ordered by distance from origin. 0-indexed. + (default = 0) + + Returns: + between_label (int) + The cluster label with the most points assigned to it + """ + most_dists = {} + for mixture_component, distance in enumerate(np.apply_along_axis(np.linalg.norm, 1, means)): + most_dists[mixture_component] = np.count_nonzero(assignments == mixture_component) + + sorted_dists = sorted(most_dists.items(), key=operator.itemgetter(1), reverse=True) + return(sorted_dists[rank][0]) + def findWithinLabel(means, assignments, rank = 0): """Identify within-strain links - Finds the component with mean closest to the origin and also akes sure + Finds the component with mean closest to the origin and also makes sure some samples are assigned to it (in the case of small weighted components with a Dirichlet prior some components are unused) @@ -59,7 +85,6 @@ def findWithinLabel(means, assignments, rank = 0): Sample cluster assignments rank (int) Which label to find, ordered by distance from origin. 0-indexed. - (default = 0) Returns: diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 62c9620b..934598e2 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -53,6 +53,7 @@ # BGMM from .bgmm import fit2dMultiGaussian from .bgmm import findWithinLabel +from .bgmm import findBetweenLabel_bgmm from .bgmm import log_likelihood from .plot import plot_results from .plot import plot_contours @@ -328,7 +329,7 @@ def fit(self, X, max_components): y = self.assign(X) self.within_label = findWithinLabel(self.means, y) - self.between_label = findWithinLabel(self.means, y, 1) + self.between_label = findBetweenLabel_bgmm(self.means, y) return y diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index 239b8768..8479f4e2 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -388,12 +388,13 @@ def plot_contours(model, assignments, title, out_prefix): # avoid recursive import from .bgmm import log_likelihood from .bgmm import findWithinLabel + from .bgmm import findBetweenLabel_bgmm xx, yy, xy = get_grid(0, 1, 100) # for likelihood boundary z = model.assign(xy, values=True, progress=False) - z_diff = z[:,findWithinLabel(model.means, assignments, 0)] - z[:,findWithinLabel(model.means, assignments, 1)] + z_diff = z[:,findWithinLabel(model.means, assignments, 0)] - z[:,findBetweenLabel_bgmm(model.means, assignments)] z = z_diff.reshape(xx.shape).T # For full likelihood surface From 3306f01fb7c1ca31b4f9b7b8b62b6b4303203434 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 9 Feb 2023 14:29:28 +0000 Subject: [PATCH 29/37] Enable alteration of unconstrained boundary search --- PopPUNK/refine.py | 4 ++-- PopPUNK/utils.py | 9 ++++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index aa0f9a00..5508f6cc 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -114,8 +114,8 @@ def refineFit(distMat, sample_names, mean0, mean1, scale, raise RuntimeError("Unconstrained optimization and indiv-refine incompatible") global_grid_resolution = 20 - x_max_start, y_max_start = decisionBoundary(mean0, gradient) - x_max_end, y_max_end = decisionBoundary(mean1, gradient) + x_max_start, y_max_start = decisionBoundary(mean0, gradient, adj = -1*min_move) + x_max_end, y_max_end = decisionBoundary(mean1, gradient, adj = -max_move) if x_max_start < 0 or y_max_start < 0: raise RuntimeError("Boundary range below zero") diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index cae085cc..497d190d 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -519,7 +519,7 @@ def transformLine(s, mean0, mean1): return np.array([x, y]) -def decisionBoundary(intercept, gradient): +def decisionBoundary(intercept, gradient, adj = 0.0): """Returns the co-ordinates where the triangle the decision boundary forms meets the x- and y-axes. @@ -529,12 +529,19 @@ def decisionBoundary(intercept, gradient): which intercepts the boundary gradient (float) Gradient of the line + adj (float) + Distance by which to shift the interception point Returns: x (float) The x-axis intercept y (float) The y-axis intercept """ + if adj != 0.0: + original_hypotenuse = (intercept[0]**2 + intercept[1]**2)**0.5 + length_ratio = (original_hypotenuse + adj)/original_hypotenuse + intercept[0] = intercept[0] * length_ratio + intercept[1] = intercept[1] * length_ratio x = intercept[0] + intercept[1] * gradient y = intercept[1] + intercept[0] / gradient return(x, y) From bf4fcecbc4ade8e22be282e4ab20172d80f3f61d Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 9 Feb 2023 16:32:46 +0000 Subject: [PATCH 30/37] Correct sign --- PopPUNK/refine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 5508f6cc..5bf2c402 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -115,7 +115,7 @@ def refineFit(distMat, sample_names, mean0, mean1, scale, global_grid_resolution = 20 x_max_start, y_max_start = decisionBoundary(mean0, gradient, adj = -1*min_move) - x_max_end, y_max_end = decisionBoundary(mean1, gradient, adj = -max_move) + x_max_end, y_max_end = decisionBoundary(mean1, gradient, adj = max_move) if x_max_start < 0 or y_max_start < 0: raise RuntimeError("Boundary range below zero") From f5ec728c1a8718ee766fd3bdad69a7e03df2f198 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 29 Feb 2024 17:39:38 +0000 Subject: [PATCH 31/37] Update function call --- PopPUNK/refine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 45963b20..f256e0a1 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -369,7 +369,7 @@ def expand_cugraph_network(G, G_extra_df): if 'src' in G_original_df.columns: G_original_df.columns = ['source','destination'] G_df = cudf.concat([G_original_df,G_extra_df]) - G = add_self_loop(G_df, G_vertex_count, weights = False, renumber = False) + G = generate_cugraph(G_df, G_vertex_count, weights = False, renumber = False) return G def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx = 0, From e8cb660307febe6fd1c40a37372580b4618ca318 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Mon, 11 Mar 2024 14:17:05 +0000 Subject: [PATCH 32/37] Change network parsing --- PopPUNK/network.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 44519a58..64706e7c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -327,7 +327,10 @@ def extractReferences(G, dbOrder, outPrefix, outSuffix = '', type_isolate = None # Make a graph of the component from the overall graph vertices_in_component = component_assignments[component_assignments['labels']==component]['vertex'] references_in_component = vertices_in_component[vertices_in_component.isin(reference_indices)].values - G_component_df = G_df[G_df['source'].isin(vertices_in_component) & G_df['destination'].isin(vertices_in_component)] + if 'src' in G_df.columns: + G_component_df = G_df[G_df['src'].isin(vertices_in_component) & G_df['dst'].isin(vertices_in_component)] + else: + G_component_df = G_df[G_df['source'].isin(vertices_in_component) & G_df['destination'].isin(vertices_in_component)] G_component = cugraph.Graph() G_component.from_cudf_edgelist(G_component_df) # Find single shortest path from a reference to all other nodes in the component From c4ba9d5c43fcfc4506f0234ec501aa500909fd96 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Tue, 12 Mar 2024 11:12:50 +0000 Subject: [PATCH 33/37] Update column names --- PopPUNK/network.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 64706e7c..e14ce1b1 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -327,10 +327,7 @@ def extractReferences(G, dbOrder, outPrefix, outSuffix = '', type_isolate = None # Make a graph of the component from the overall graph vertices_in_component = component_assignments[component_assignments['labels']==component]['vertex'] references_in_component = vertices_in_component[vertices_in_component.isin(reference_indices)].values - if 'src' in G_df.columns: - G_component_df = G_df[G_df['src'].isin(vertices_in_component) & G_df['dst'].isin(vertices_in_component)] - else: - G_component_df = G_df[G_df['source'].isin(vertices_in_component) & G_df['destination'].isin(vertices_in_component)] + G_component_df = G_df[G_df['old_source'].isin(vertices_in_component) & G_df['old_destination'].isin(vertices_in_component)] G_component = cugraph.Graph() G_component.from_cudf_edgelist(G_component_df) # Find single shortest path from a reference to all other nodes in the component From 3c4a9cb121462208b9d21c1894679ab4b5693a6d Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Tue, 12 Mar 2024 12:25:10 +0000 Subject: [PATCH 34/37] Rename columns --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index e14ce1b1..eb9d993d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -329,7 +329,7 @@ def extractReferences(G, dbOrder, outPrefix, outSuffix = '', type_isolate = None references_in_component = vertices_in_component[vertices_in_component.isin(reference_indices)].values G_component_df = G_df[G_df['old_source'].isin(vertices_in_component) & G_df['old_destination'].isin(vertices_in_component)] G_component = cugraph.Graph() - G_component.from_cudf_edgelist(G_component_df) + G_component.from_cudf_edgelist(G_component_df.rename(columns={'old_source': 'src','old_destination': 'dst'}, inplace=True)) # Find single shortest path from a reference to all other nodes in the component traversal = cugraph.traversal.sssp(G_component,source = references_in_component[0]) reference_index_set = set(reference_indices) From ec4f8c10cb4cf12192aa00573005e4314a276073 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 13 Mar 2024 09:10:21 +0000 Subject: [PATCH 35/37] Fix component graph construction --- PopPUNK/network.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index eb9d993d..c0ae9a14 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -329,9 +329,10 @@ def extractReferences(G, dbOrder, outPrefix, outSuffix = '', type_isolate = None references_in_component = vertices_in_component[vertices_in_component.isin(reference_indices)].values G_component_df = G_df[G_df['old_source'].isin(vertices_in_component) & G_df['old_destination'].isin(vertices_in_component)] G_component = cugraph.Graph() - G_component.from_cudf_edgelist(G_component_df.rename(columns={'old_source': 'src','old_destination': 'dst'}, inplace=True)) + G_component_df.rename(columns={'old_source': 'source', 'old_destination': 'destination'}, inplace=True) + G_component.from_cudf_edgelist(G_component_df) # Find single shortest path from a reference to all other nodes in the component - traversal = cugraph.traversal.sssp(G_component,source = references_in_component[0]) + traversal = cugraph.traversal.bfs(G_component,source = references_in_component[0]) reference_index_set = set(reference_indices) # Add predecessors to reference sequences on the SSSPs predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values From 986a43d44e2dc68e96ce2dc1471840b1abfb6dc7 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 13 Mar 2024 10:09:19 +0000 Subject: [PATCH 36/37] Fix BFS arg --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index c0ae9a14..34930a21 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -332,7 +332,7 @@ def extractReferences(G, dbOrder, outPrefix, outSuffix = '', type_isolate = None G_component_df.rename(columns={'old_source': 'source', 'old_destination': 'destination'}, inplace=True) G_component.from_cudf_edgelist(G_component_df) # Find single shortest path from a reference to all other nodes in the component - traversal = cugraph.traversal.bfs(G_component,source = references_in_component[0]) + traversal = cugraph.traversal.bfs(G_component,start = references_in_component[0]) reference_index_set = set(reference_indices) # Add predecessors to reference sequences on the SSSPs predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values From 5e63f2e30a13c61d0e682356dd68f29724a5b1ff Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 24 May 2024 06:29:00 +0100 Subject: [PATCH 37/37] Enable compatibility with CPU-only mandrake --- PopPUNK/mandrake.py | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/PopPUNK/mandrake.py b/PopPUNK/mandrake.py index 2d14a350..fc9e81f5 100644 --- a/PopPUNK/mandrake.py +++ b/PopPUNK/mandrake.py @@ -70,23 +70,29 @@ def generate_embedding(seqLabels, accMat, perplexity, outPrefix, overwrite, kNN weights = np.ones((len(seqLabels))) random.Random() seed = random.randint(0, 2**32) - if use_gpu and gpu_fn_available: - sys.stderr.write("Running on GPU\n") - n_workers = 65536 - maxIter = round(maxIter / n_workers) - wtsne_call = partial(wtsne_gpu_fp64, - perplexity=perplexity, - maxIter=maxIter, - blockSize=128, - n_workers=n_workers, - nRepuSamp=5, - eta0=1, - bInit=0, - animated=False, - cpu_threads=n_threads, - device_id=device_id, - seed=seed) - else: + gpu_analysis_complete = False + try: + if use_gpu and gpu_fn_available: + sys.stderr.write("Running on GPU\n") + n_workers = 65536 + maxIter = round(maxIter / n_workers) + wtsne_call = partial(wtsne_gpu_fp64, + perplexity=perplexity, + maxIter=maxIter, + blockSize=128, + n_workers=n_workers, + nRepuSamp=5, + eta0=1, + bInit=0, + animated=False, + cpu_threads=n_threads, + device_id=device_id, + seed=seed) + gpu_analysis_complete = True + except: + # If installed through conda/mamba mandrake is not GPU-enabled by default + sys.stderr.write('Mandrake analysis with GPU failed; trying with CPU\n') + if not gpu_analysis_complete: sys.stderr.write("Running on CPU\n") maxIter = round(maxIter / n_threads) wtsne_call = partial(wtsne,