diff --git a/CHANGES.md b/CHANGES.md index c7efba787..f7846af1a 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -11,13 +11,16 @@ ### Bug Fixes +* filter: Previously, when `--subsample-max-sequences` was slightly lower than the number of groups, it was possible to fail with an uncaught `AssertionError`. Internal calculations have been adjusted to prevent this from happening. [#1588][] [#1598][] (@victorlin) * filter: Improved warning and error messages in the case of missing columns. [#1604] (@victorlin) * merge: Any user-customized `~/.sqliterc` file is now ignored so it doesn't break `augur merge`'s internal use of SQLite. [#1608][] (@tsibley) * merge: Non-id columns in metadata inputs that would conflict with the output id column are now forbidden and will cause an error if present. Previously they would overwrite values in the output id column, causing incorrect output. [#1593][] (@tsibley) * import: Spaces in BEAST MCC tree annotations (for example, from a discrete state reconstruction) no longer break `augur import beast`'s parsing. [#1610][] (@watronfire) +[#1588]: https://github.com/nextstrain/augur/issues/1588 [#1593]: https://github.com/nextstrain/augur/pull/1593 [#1594]: https://github.com/nextstrain/augur/pull/1594 +[#1598]: https://github.com/nextstrain/augur/issues/1598 [#1604]: https://github.com/nextstrain/augur/pull/1604 [#1608]: https://github.com/nextstrain/augur/pull/1608 [#1610]: https://github.com/nextstrain/augur/pull/1610 diff --git a/augur/filter/subsample.py b/augur/filter/subsample.py index 11f2eef86..277398915 100644 --- a/augur/filter/subsample.py +++ b/augur/filter/subsample.py @@ -548,10 +548,7 @@ def calculate_sequences_per_group(target_max_value, group_sizes, allow_probabili except TooManyGroupsError as error: if allow_probabilistic: print_err(f"WARNING: {error}") - sequences_per_group = _calculate_fractional_sequences_per_group( - target_max_value, - group_sizes, - ) + sequences_per_group = target_max_value / len(group_sizes) probabilistic_used = True else: raise error @@ -634,52 +631,3 @@ def _calculate_sequences_per_group( return int(hi) else: return int(lo) - - -def _calculate_fractional_sequences_per_group( - target_max_value: int, - group_sizes: Collection[int] -) -> float: - """Returns the fractional sequences per group for the given list of group - sequences such that the total doesn't exceed the requested number of - samples. - - Parameters - ---------- - target_max_value : int - the total number of sequences allowed across all groups - group_sizes : Collection[int] - the number of sequences in each group - - Returns - ------- - float - fractional maximum number of sequences allowed per group to meet the - required maximum total sequences allowed - - Examples - -------- - >>> np.around(_calculate_fractional_sequences_per_group(4, [4, 2]), 4) - 1.9375 - >>> np.around(_calculate_fractional_sequences_per_group(2, [4, 2]), 4) - 0.9688 - - Unlike the integer-based version of this function, the fractional version - can accept a maximum number of sequences that exceeds the number of groups. - In this case, the function returns a fraction that can be used downstream, - for example with Poisson sampling. - - >>> np.around(_calculate_fractional_sequences_per_group(1, [4, 2]), 4) - 0.4844 - """ - lo = 1e-5 - hi = float(target_max_value) - - while (hi / lo) > 1.1: - mid = (lo + hi) / 2 - if _calculate_total_sequences(mid, group_sizes) <= target_max_value: - lo = mid - else: - hi = mid - - return (lo + hi) / 2 diff --git a/tests/functional/filter/cram/subsample-max-sequences-with-probabilistic-sampling-warning.t b/tests/functional/filter/cram/subsample-max-sequences-with-probabilistic-sampling-warning.t index 72ed589f2..7939bbc0c 100644 --- a/tests/functional/filter/cram/subsample-max-sequences-with-probabilistic-sampling-warning.t +++ b/tests/functional/filter/cram/subsample-max-sequences-with-probabilistic-sampling-warning.t @@ -15,7 +15,7 @@ Explicitly use probabilistic subsampling to handle the case when there are more > --probabilistic-sampling \ > --output-strains filtered_strains_probabilistic.txt > /dev/null WARNING: Asked to provide at most 5 sequences, but there are 8 groups. - Sampling probabilistically at 0.6055 sequences per group, meaning it is possible to have more than the requested maximum of 5 sequences after filtering. + Sampling probabilistically at 0.6250 sequences per group, meaning it is possible to have more than the requested maximum of 5 sequences after filtering. 10 strains were dropped during filtering 1 had no metadata 1 had no sequence data @@ -36,7 +36,7 @@ Using the default probabilistic subsampling, should work the same as the previou > --subsample-seed 314159 \ > --output-strains filtered_strains_default.txt > /dev/null WARNING: Asked to provide at most 5 sequences, but there are 8 groups. - Sampling probabilistically at 0.6055 sequences per group, meaning it is possible to have more than the requested maximum of 5 sequences after filtering. + Sampling probabilistically at 0.6250 sequences per group, meaning it is possible to have more than the requested maximum of 5 sequences after filtering. 10 strains were dropped during filtering 1 had no metadata 1 had no sequence data diff --git a/tests/functional/filter/cram/subsample-probabilistic-sampling-output.t b/tests/functional/filter/cram/subsample-probabilistic-sampling-output.t index c908988fa..865204107 100644 --- a/tests/functional/filter/cram/subsample-probabilistic-sampling-output.t +++ b/tests/functional/filter/cram/subsample-probabilistic-sampling-output.t @@ -12,7 +12,7 @@ Check output of probabilistic sampling. > --subsample-seed 314159 \ > --output-metadata filtered_metadata.tsv WARNING: Asked to provide at most 3 sequences, but there are 8 groups. - Sampling probabilistically at 0.3633 sequences per group, meaning it is possible to have more than the requested maximum of 3 sequences after filtering. + Sampling probabilistically at 0.3750 sequences per group, meaning it is possible to have more than the requested maximum of 3 sequences after filtering. 10 strains were dropped during filtering 1 was dropped during grouping due to ambiguous year information 1 was dropped during grouping due to ambiguous month information diff --git a/tests/functional/filter/cram/subsample-skip-ambiguous-dates.t b/tests/functional/filter/cram/subsample-skip-ambiguous-dates.t index b638bdd59..dec6bedad 100644 --- a/tests/functional/filter/cram/subsample-skip-ambiguous-dates.t +++ b/tests/functional/filter/cram/subsample-skip-ambiguous-dates.t @@ -13,7 +13,7 @@ Strains with ambiguous years or months should be dropped and logged. > --output-strains filtered_strains.txt \ > --output-log filtered_log.tsv > /dev/null WARNING: Asked to provide at most 5 sequences, but there are 6 groups. - Sampling probabilistically at 0.8203 sequences per group, meaning it is possible to have more than the requested maximum of 5 sequences after filtering. + Sampling probabilistically at 0.8333 sequences per group, meaning it is possible to have more than the requested maximum of 5 sequences after filtering. 8 strains were dropped during filtering 1 was dropped during grouping due to ambiguous year information 1 was dropped during grouping due to ambiguous month information