Skip to content

Commit

Permalink
Move database processing function into sketchlib.py
Browse files Browse the repository at this point in the history
  • Loading branch information
nickjcroucher committed Oct 8, 2024
1 parent a1691af commit 7d9e17d
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 15 deletions.
8 changes: 5 additions & 3 deletions PopPUNK/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ def main():

# Imports are here because graph tool is very slow to load
from .models import loadClusterFit, BGMMFit, DBSCANFit, RefineFit, LineageFit
from .sketchlib import checkSketchlibLibrary, removeFromDB
from .sketchlib import checkSketchlibLibrary, removeFromDB, get_database_statistics

from .network import construct_network_from_edge_list
from .network import construct_network_from_assignments
Expand Down Expand Up @@ -393,7 +393,8 @@ def main():
plot_scatter(distMat,
args.output,
args.output + " distances")
plot_database_evaluations(args.output)
genome_lengths, ambiguous_bases = get_database_statistics(args.output)
plot_database_evaluations(genome_lengths, ambiguous_bases)

#******************************#
#* *#
Expand Down Expand Up @@ -471,7 +472,8 @@ def main():
plot_scatter(distMat,
output,
output + " distances")
plot_database_evaluations(output)
genome_lengths, ambiguous_bases = get_database_statistics(args.output)
plot_database_evaluations(genome_lengths, ambiguous_bases)

#******************************#
#* *#
Expand Down
17 changes: 5 additions & 12 deletions PopPUNK/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
# for other outputs
import pandas as pd
from pandas.errors import DataError
import h5py
from collections import defaultdict
from sklearn import utils
try: # sklearn >= 0.22
Expand Down Expand Up @@ -82,21 +81,15 @@ def plot_scatter(X, out_prefix, title, kde = True):
plt.savefig(os.path.join(out_prefix, os.path.basename(out_prefix) + '_distanceDistribution.png'))
plt.close()

def plot_database_evaluations(prefix):
def plot_database_evaluations(genome_lengths, ambiguous_bases):
"""Plot histograms of sequence characteristics for database evaluation.
Args:
prefix (str)
Prefix of database
genome_lengths (list)
Lengths of genomes in database
ambiguous_bases (list)
Counts of ambiguous bases in genomes in database
"""
db_file = prefix + "/" + os.path.basename(prefix) + ".h5"
ref_db = h5py.File(db_file, 'r')

genome_lengths = []
ambiguous_bases = []
for sample_name in list(ref_db['sketches'].keys()):
genome_lengths.append(ref_db['sketches/' + sample_name].attrs['length'])
ambiguous_bases.append(ref_db['sketches/' + sample_name].attrs['missing_bases'])
plot_evaluation_histogram(genome_lengths,
n_bins = 100,
prefix = prefix,
Expand Down
18 changes: 18 additions & 0 deletions PopPUNK/sketchlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -659,3 +659,21 @@ def fitKmerCurve(pairwise, klist, jacobian):

# Return core, accessory
return(np.flipud(transformed_params))

def get_database_statistics(prefix):
    """Extract statistics for evaluating databases.

    Reads the sketch database ``<prefix>/<basename of prefix>.h5`` and
    collects the ``length`` and ``missing_bases`` attributes stored on
    every entry of its ``sketches`` group.

    Args:
        prefix (str)
            Prefix of database

    Returns:
        genome_lengths (list)
            Value of the ``length`` attribute for each sketch in database
        ambiguous_bases (list)
            Value of the ``missing_bases`` attribute for each sketch in
            database (same order as ``genome_lengths``)
    """
    db_file = prefix + "/" + os.path.basename(prefix) + ".h5"

    genome_lengths = []
    ambiguous_bases = []
    # Open read-only inside a context manager so the HDF5 handle is always
    # released, even if a sketch is missing one of the expected attributes
    with h5py.File(db_file, 'r') as ref_db:
        sketches = ref_db['sketches']
        for sample_name in sketches:
            sample_attrs = sketches[sample_name].attrs
            genome_lengths.append(sample_attrs['length'])
            ambiguous_bases.append(sample_attrs['missing_bases'])

    return genome_lengths, ambiguous_bases


# Backward-compatible alias: this function was originally added to this module
# under the name of the plotting routine it was split out from, while callers
# import it as get_database_statistics.
plot_database_evaluations = get_database_statistics

0 comments on commit 7d9e17d

Please sign in to comment.