Skip to content

Commit

Permalink
Resolve conflicts with master
Browse files Browse the repository at this point in the history
  • Loading branch information
nickjcroucher committed Feb 29, 2024
2 parents 5652532 + 27e7f85 commit 025a818
Show file tree
Hide file tree
Showing 20 changed files with 478 additions and 146 deletions.
20 changes: 20 additions & 0 deletions .github/workflows/docs_pr.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Build the Sphinx documentation for every pull request and attach the
# rendered HTML as a downloadable artifact so reviewers can inspect it.
name: "Pull Request Docs Check"

on: [pull_request]

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v1

      # Runs `make html` (via sphinx-build) inside the given docs folder.
      - uses: ammaraskar/sphinx-action@master
        with:
          docs-folder: "docs/"

      # NOTE(review): checkout@v1 and upload-artifact@v1 are deprecated
      # action versions — consider upgrading, but kept as-is here.
      - uses: actions/upload-artifact@v1
        with:
          name: DocumentationHTML
          path: docs/_build/html/
42 changes: 42 additions & 0 deletions .github/workflows/docs_push.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Build the Sphinx documentation on every push to master and publish
# the result to GitHub Pages using the official deploy-pages action.
name: "Build and publish docs"

on:
  push:
    branches:
      - master

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v1

      # Runs sphinx-build against the docs/ folder.
      - uses: ammaraskar/sphinx-action@master
        with:
          docs-folder: "docs/"

      # The Pages artifact must be named "github-pages" for
      # actions/deploy-pages to find it in the deploy job below.
      - uses: actions/upload-pages-artifact@v2
        with:
          name: github-pages
          path: docs/_build/html/
          retention-days: 1

  deploy:
    # Minimum permissions required by actions/deploy-pages
    # (id-token: write enables OIDC-based deployment).
    permissions:
      contents: read
      pages: write
      id-token: write

    needs: build

    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}

    runs-on: ubuntu-latest
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v3
9 changes: 7 additions & 2 deletions .readthedocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
# Required
version: 2

# Set the version of Python and other tools you might need
build:
os: ubuntu-22.04
tools:
python: "3.11"

# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/conf.py
Expand All @@ -14,8 +20,7 @@ formats: all

# Optionally set the version of Python and requirements required to build your docs
python:
version: 3.8
install:
- requirements: docs/requirements.txt
- method: setuptools
path: docs
path: docs
2 changes: 1 addition & 1 deletion PopPUNK/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

'''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''

__version__ = '2.6.1'
__version__ = '2.6.4'

# Minimum sketchlib version
SKETCHLIB_MAJOR = 2
Expand Down
57 changes: 48 additions & 9 deletions PopPUNK/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,34 @@ def get_options():

# model fitting
modelGroup = parser.add_argument_group('Model fit options')
modelGroup.add_argument('--K', help='Maximum number of mixture components [default = 2]', type=int, default=2)
modelGroup.add_argument('--D', help='Maximum number of clusters in DBSCAN fitting [default = 100]', type=int, default=100)
modelGroup.add_argument('--min-cluster-prop', help='Minimum proportion of points in a cluster '
'in DBSCAN fitting [default = 0.0001]', type=float, default=0.0001)
modelGroup.add_argument('--threshold', help='Cutoff if using --fit-model threshold', type=float)
modelGroup.add_argument('--model-subsample',
help='Number of pairwise distances used to fit model [default = 100000]',
type=int,
default=100000)
modelGroup.add_argument('--assign-subsample',
help='Number of pairwise distances in each assignment batch [default = 5000]',
type=int,
default=5000)
modelGroup.add_argument('--for-refine',
help='Fit a BGMM or DBSCAN model without assigning all points to initialise a refined model',
default=False,
action='store_true')
modelGroup.add_argument('--K',
help='Maximum number of mixture components [default = 2]',
type=int,
default=2)
modelGroup.add_argument('--D',
help='Maximum number of clusters in DBSCAN fitting [default = 100]',
type=int,
default=100)
modelGroup.add_argument('--min-cluster-prop',
help='Minimum proportion of points in a cluster '
'in DBSCAN fitting [default = 0.0001]',
type=float,
default=0.0001)
modelGroup.add_argument('--threshold',
help='Cutoff if using --fit-model threshold',
type=float)

# model refinement
refinementGroup = parser.add_argument_group('Network analysis and model refinement options')
Expand Down Expand Up @@ -183,6 +206,7 @@ def get_options():
other.add_argument('--threads', default=1, type=int, help='Number of threads to use [default = 1]')
other.add_argument('--gpu-sketch', default=False, action='store_true', help='Use a GPU when calculating sketches (read data only) [default = False]')
other.add_argument('--gpu-dist', default=False, action='store_true', help='Use a GPU when calculating distances [default = False]')
other.add_argument('--gpu-model', default=False, action='store_true', help='Use a GPU when fitting a model [default = False]')
other.add_argument('--gpu-graph', default=False, action='store_true', help='Use a GPU when calculating networks [default = False]')
other.add_argument('--deviceid', default=0, type=int, help='CUDA device ID, if using GPU [default = 0]')
other.add_argument('--no-plot', help='Switch off model plotting, which can be slow for large datasets',
Expand Down Expand Up @@ -501,14 +525,24 @@ def main():
if args.fit_model:
# Run DBSCAN model
if args.fit_model == "dbscan":
model = DBSCANFit(output)
model = DBSCANFit(output,
max_samples = args.model_subsample,
max_batch_size = args.assign_subsample,
assign_points = not args.for_refine)
model.set_threads(args.threads)
assignments = model.fit(distMat, args.D, args.min_cluster_prop)
assignments = model.fit(distMat,
args.D,
args.min_cluster_prop,
args.gpu_model)
# Run Gaussian model
elif args.fit_model == "bgmm":
model = BGMMFit(output)
model = BGMMFit(output,
max_samples = args.model_subsample,
max_batch_size = args.assign_subsample,
assign_points = not args.for_refine)
model.set_threads(args.threads)
assignments = model.fit(distMat, args.K)
assignments = model.fit(distMat,
args.K)
elif args.fit_model == "refine":
new_model = RefineFit(output)
new_model.set_threads(args.threads)
Expand Down Expand Up @@ -568,6 +602,11 @@ def main():
else:
assignments = model.assign(distMat)

# end here if not assigning data
if args.for_refine:
sys.stderr.write('Initial model fit complete; points will be assigned when this model is refined\nusing "--fit-model refine"\n')
sys.exit(0)

#******************************#
#* *#
#* network construction *#
Expand Down
7 changes: 6 additions & 1 deletion PopPUNK/assign.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,11 @@ def assign_query_hdf5(dbFuncs,
raise RuntimeError("lineage models cannot be used with --serial")
model.set_threads(threads)

# Only proceed with a fully-fitted model
if not model.fitted or (hasattr(model,'assign_points') and model.assign_points == False):
sys.stderr.write('Cannot assign points with an incompletely-fitted model\nPlease refine this initial fit with "--fit-model refine"\n')
sys.exit(1)

# Set directories of previous fit
if previous_clustering is not None:
prev_clustering = previous_clustering
Expand Down Expand Up @@ -753,7 +758,7 @@ def assign_query_hdf5(dbFuncs,
postpruning_combined_seq, newDistMat = \
prune_distance_matrix(combined_seq, names_to_remove, complete_distMat,
output + "/" + os.path.basename(output) + dists_suffix)
graph_suffix = file_extension_string + '_refs_graph'
graph_suffix = file_extension_string + '.refs_graph'
save_network(genomeNetwork,
prefix = output,
suffix = graph_suffix,
Expand Down
44 changes: 31 additions & 13 deletions PopPUNK/dbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
# hdbscan
import hdbscan

def fitDbScan(X, min_samples, min_cluster_size, cache_out):
from .utils import check_and_set_gpu

def fitDbScan(X, min_samples, min_cluster_size, cache_out, use_gpu = False):
"""Function to fit DBSCAN model as an alternative to the Gaussian
Fits the DBSCAN model to the distances using hdbscan
Expand All @@ -23,26 +25,42 @@ def fitDbScan(X, min_samples, min_cluster_size, cache_out):
Minimum number of points in a cluster for HDBSCAN
cache_out (str)
Prefix for DBSCAN cache used for refitting
use_gpu (bool)
Whether GPU algorithms should be used in DBSCAN fitting
Returns:
hdb (hdbscan.HDBSCAN)
hdb (hdbscan.HDBSCAN or cuml.cluster.HDBSCAN)
Fitted HDBSCAN to subsampled data
labels (list)
Cluster assignments of each sample
n_clusters (int)
Number of clusters used
"""
# set DBSCAN clustering parameters
hdb = hdbscan.HDBSCAN(algorithm='boruvka_balltree',
min_samples = min_samples,
#core_dist_n_jobs = threads, # may cause error, see #19
memory = cache_out,
prediction_data = True,
min_cluster_size = min_cluster_size
).fit(X)
# Number of clusters in labels, ignoring noise if present.
labels = hdb.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
if use_gpu:
from cuml import cluster
import cupy as cp
sys.stderr.write('Fitting HDBSCAN model using a GPU\n')
hdb = cluster.hdbscan.HDBSCAN(min_samples = min_samples,
output_type = 'cupy',
prediction_data = True,
min_cluster_size = min_cluster_size
).fit(X)
# Number of clusters in labels, ignoring noise if present.
labels = hdb.labels_
n_clusters = len(cp.unique(labels[labels>-1]))
else:
sys.stderr.write('Fitting HDBSCAN model using a CPU\n')
hdb = hdbscan.HDBSCAN(algorithm='boruvka_balltree',
min_samples = min_samples,
#core_dist_n_jobs = threads, # may cause error, see #19
memory = cache_out,
prediction_data = True,
min_cluster_size = min_cluster_size
).fit(X)
# Number of clusters in labels, ignoring noise if present.
labels = hdb.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

# return model parameters
return hdb, labels, n_clusters
Expand Down Expand Up @@ -70,7 +88,7 @@ def evaluate_dbscan_clusters(model):

# evaluate whether maxima of cluster nearest origin do not
# overlap with minima of cluster furthest from origin
if core_minimum_of_between > core_maximum_of_within and \
if core_minimum_of_between > core_maximum_of_within or \
accessory_minimum_of_between > accessory_maximum_of_within:
indistinct = False

Expand Down
Loading

0 comments on commit 025a818

Please sign in to comment.