diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index c5a5c40..5e55163 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -8,3 +8,5 @@ jobs: - uses: actions/checkout@v4 - name: Run clang-format style check for C/C++/Protobuf programs. uses: jidicula/clang-format-action@v4.11.0 + with: + clang-format-version: '17' diff --git a/README.md b/README.md index 1299f25..8032114 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Digest C++ library which supports various minimizer schemes for digestion of DNA sequences -# Implementation (Most of the documentation is in the code) +# Implementation Supports Mod Minimizers, Window Minimizers, and Syncmers Uses the cyclic or hash provided by [ntHash](https://github.com/bcgsc/ntHash). For now I just downloaded the essential files off their github and compiled it myself but I may change how I link in ntHash in the future. @@ -28,11 +28,8 @@ This will generate `include` and `lib` folders. # Usage [Documentation](https://veryamazed.github.io/digest/) -* Headers at `#include ` -* Classes are in `digest` namespace -* example compile: `g++ file.cpp -IPREFIX/include -LPREFIX/lib -ldigest` -* may need `std=c++17` -* ntHash does not support `large_window < 4` +* Digest objects require that the input string is kept in memory, unmodified. +* requires `c++17` # Example ```cpp @@ -44,22 +41,11 @@ Example snippet to collect up to 100000 indices of minimizers. A vector must be passed in, which will be appended to. Each WindowMin / Syncmer object is templated by the algorithm / data structure to find minimizers. 
-# Selecting the correct `data_structure` -our general guidelines: -* for `large_window` < 12, use Naive -* for 12 <= `large_window` <= 16 use SegmentTree -* for `large_window` > 16 use Naive2 - -adaptive performs at worst about 10% slower than best -adaptive64 performs at worst about 100% slower than best +A complete example and cli can be found [here](https://github.com/BenLangmead/gester/tree/main) # Contributing -run -```bash -ninja clang-format -ninja clang-tidy -ninja docs -``` +Use clang format version 17. +run `ninja clang-format` before submitting a PR. # Benchmark / Tests ```bash @@ -67,5 +53,3 @@ meson setup build cd build && meson compile ``` this will generate proper executables for benchmark/testing - -add to forked repo diff --git a/docs/Doxyfile b/docs/Doxyfile index 64e88c0..820a7ae 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -588,7 +588,7 @@ RESOLVE_UNNAMED_PARAMS = YES # section is generated. This option has no effect if EXTRACT_ALL is enabled. # The default value is: NO. -HIDE_UNDOC_MEMBERS = NO +HIDE_UNDOC_MEMBERS = YES # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set @@ -949,7 +949,7 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = include/digest +INPUT = README.md include/digest # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -1054,7 +1054,7 @@ RECURSIVE = NO # Note that relative paths are relative to the directory from which doxygen is # run. -EXCLUDE = +EXCLUDE = # ./include/digest/data_structure.hpp # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -1165,7 +1165,7 @@ FILTER_SOURCE_PATTERNS = # (index.html). 
This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. -USE_MDFILE_AS_MAINPAGE = +USE_MDFILE_AS_MAINPAGE = README.md # The Fortran standard specifies that for fixed formatted Fortran code all # characters from position 72 are to be considered as comment. A common diff --git a/include/digest/data_structure.hpp b/include/digest/data_structure.hpp index ddb1bcf..34f097b 100644 --- a/include/digest/data_structure.hpp +++ b/include/digest/data_structure.hpp @@ -1,396 +1,463 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -// requirement on all data_structures -// constructor which accepts uint32_t -// void set(uint32_t index, (uint 32/64) hash) -// uint32_t min() // returns minimum -// pair min_hash() // returns index, hash -// void min_syncmer(vector &vec) // appends minimum if syncmer -// void min_syncmer(vector> &vec) // appends (left -// syncmer index, right syncmer index) if syncmer assignment/copy constructors -// if you want to use them - -namespace digest::ds { -// Based on a template taken from USACO.guide and then modified by me (for -// competitive programming), and now modified again (for this) -// https://usaco.guide/gold/PURS?lang=cpp -// https://codeforces.com/blog/entry/18051 (USACO.guide was probably heavily -// inspired by this) -/** A data structure that can answer point update & range minimum queries. 
*/ -template struct SegmentTree { - int i = k; - std::array segtree = {}; - - constexpr int log2() { return std::ceil(std::log2(k)); } - - SegmentTree(uint32_t) {} - SegmentTree(const SegmentTree &other) = default; - SegmentTree &operator=(const SegmentTree &other) = default; - - void insert(uint32_t index, uint32_t hash) { - int ind = i; - if (++i == 2 * k) - i = k; - - // negate so we can use max so that ties are broken by rightmost - segtree[ind] = (uint64_t)~hash << 32 | index; - for (int rep = 0; rep < log2(); rep++) { - segtree[ind >> 1] = std::max(segtree[ind], segtree[ind ^ 1]); - ind >>= 1; - } - } - - uint32_t min() { return segtree[1]; } - - uint32_t min_hash() { return ~(segtree[1] >> 32); } - - void min_syncmer(std::vector &vec) { - if (segtree[1] >> 32 == - std::max(uint32_t(segtree[i] >> 32), - uint32_t(segtree[i == k ? 2 * k - 1 : i - 1] >> 32))) { - vec.emplace_back(segtree[i]); - } - } - - void min_syncmer(std::vector> &vec) { - if (segtree[1] >> 32 == - std::max(uint32_t(segtree[i] >> 32), - uint32_t(segtree[i == k ? 2 * k - 1 : i - 1] >> 32))) { - vec.emplace_back(segtree[i], ~(segtree[1] >> 32)); - } - } -}; - -template struct Naive { - std::array arr; - unsigned int i = 0; - - Naive(uint32_t){}; - Naive(const Naive &other) = default; - Naive &operator=(const Naive &other) = default; - - void insert(uint32_t index, uint32_t hash) { - arr[i] = (uint64_t)~hash << 32 | index; - if (++i == k) - i = 0; - } - - uint32_t min() { - int i = k - 1; - for (int j = k - 2; j >= 0; j--) { - if (arr[j] > arr[i]) { - i = j; - } - } - return arr[i]; - } - - uint32_t min_hash() { - int i = k - 1; - for (int j = k - 2; j >= 0; j--) { - if (arr[j] > arr[i]) { - i = j; - } - } - return ~(uint32_t)(arr[i] >> 32); - } - - void min_syncmer(std::vector &vec) { - unsigned int j = 0; - for (unsigned int l = 1; l < k; l++) { - if (arr[l] > arr[j]) { - j = l; - } - } - if (arr[j] >> 32 == std::max(uint32_t(arr[i] >> 32), - uint32_t(arr[i ? 
i - 1 : k - 1] >> 32))) { - vec.emplace_back(arr[i]); - } - } - - void min_syncmer(std::vector> &vec) { - unsigned int j = k - 1; - for (int l = k - 2; l >= 0; l--) { - if (arr[l] > arr[j]) { - j = l; - } - } - if (arr[j] >> 32 == std::max(uint32_t(arr[i] >> 32), - uint32_t(arr[i ? i - 1 : k - 1] >> 32))) { - vec.emplace_back(arr[i], ~(uint32_t)(arr[j] >> 32)); - } - } -}; - -template struct Naive2 { - unsigned int i = 0; - unsigned int last = 0; - std::vector arr = std::vector(k); - - Naive2(uint32_t){}; - Naive2(const Naive2 &other) = default; - Naive2 &operator=(const Naive2 &other) = default; - - void insert(uint32_t index, uint32_t hash) { - // flip the hash bits so we can take the maximum - arr[i] = (uint64_t)~hash << 32 | index; - - if (arr[i] > arr[last]) { - last = i; - } else if (last == i) { - for (unsigned j = 0; j < k; j++) { - if (arr[j] > arr[last]) { - last = j; - } - } - } - - if (++i == k) - i = 0; - } - - uint32_t min() { return arr[last]; } - - uint32_t min_hash() { return ~(uint32_t)(arr[last] >> 32); } - - void min_syncmer(std::vector &vec) { - if (arr[last] >> 32 == std::max(uint32_t(arr[i] >> 32), - uint32_t(arr[i ? i - 1 : k - 1] >> 32))) { - vec.emplace_back(arr[i]); - } - } - - void min_syncmer(std::vector> &vec) { - if (arr[last] >> 32 == std::max(uint32_t(arr[i] >> 32), - uint32_t(arr[i ? 
i - 1 : k - 1] >> 32))) { - vec.emplace_back(arr[i], ~(uint32_t)(arr[last] >> 32)); - } - } -}; - -struct Adaptive { - uint32_t k, i = 0, last = 0; - std::vector arr; - - Adaptive(uint32_t k) : k(k), arr(k) {} - Adaptive(const Adaptive &other) = default; - Adaptive &operator=(const Adaptive &other) = default; - - void naive(uint32_t index, uint32_t hash) { - arr[i] = (uint64_t)~hash << 32 | index; - if (++i == k) - i = 0; - } - - void naive2(uint32_t index, uint32_t hash) { - // flip the hash bits so we can take the maximum - arr[i] = (uint64_t)~hash << 32 | index; - - if (arr[i] > arr[last]) { - last = i; - } else if (last == i) { - for (unsigned j = 0; j < k; j++) { - if (arr[j] > arr[last]) { - last = j; - } - } - } - - if (++i == k) - i = 0; - } - - void insert(uint32_t index, uint32_t hash) { - if (k < 16) { - naive(index, hash); - } else { - naive2(index, hash); - } - } - - uint32_t min() { - if (k < 16) { - int i = k - 1; - for (int j = k - 2; j >= 0; j--) { - if (arr[j] > arr[i]) { - i = j; - } - } - return arr[i]; - } else { - return arr[last]; - } - } - - uint32_t min_hash() { - if (k < 16) { - int i = k - 1; - for (int j = k - 2; j >= 0; j--) { - if (arr[j] > arr[i]) { - i = j; - } - } - return ~(uint32_t)(arr[i] >> 32); - } else { - return ~(uint32_t)(arr[last] >> 32); - } - } - - void min_syncmer(std::vector &vec) { - if (k < 16) { - unsigned int j = k - 1; - for (int l = k - 2; l >= 0; l--) { - if (arr[l] > arr[j]) { - j = l; - } - } - if (arr[j] >> 32 == std::max(uint32_t(arr[i] >> 32), - uint32_t(arr[i ? i - 1 : k - 1] >> 32))) { - vec.emplace_back(arr[i]); - } - } else { - if (arr[last] >> 32 == std::max(uint32_t(arr[i] >> 32), - uint32_t(arr[i ? i - 1 : k - 1] >> 32))) { - vec.emplace_back(arr[i]); - } - } - } - - void min_syncmer(std::vector> &vec) { - if (k < 16) { - unsigned int j = k - 1; - for (int l = k - 2; l >= 0; l--) { - if (arr[l] > arr[j]) { - j = l; - } - } - if (arr[j] >> 32 == std::max(uint32_t(arr[i] >> 32), - uint32_t(arr[i ? 
i - 1 : k - 1] >> 32))) { - vec.emplace_back(arr[i], ~(uint32_t)(arr[j] >> 32)); - } - } else { - if (arr[last] >> 32 == std::max(uint32_t(arr[i] >> 32), - uint32_t(arr[i ? i - 1 : k - 1] >> 32))) { - vec.emplace_back(arr[i], ~(uint32_t)(arr[last] >> 32)); - } - } - } -}; - -struct Adaptive64 { - uint32_t k, i = 0, last = 0; - std::vector<__uint128_t> arr; - - Adaptive64(uint32_t k) : k(k), arr(k) {} - Adaptive64(const Adaptive64 &other) = default; - Adaptive64 &operator=(const Adaptive64 &other) = default; - - void naive(uint32_t index, uint64_t hash) { - arr[i] = (__uint128_t)~hash << 32 | index; - if (++i == k) - i = 0; - } - - void naive2(uint32_t index, uint64_t hash) { - // flip the hash bits so we can take the maximum - arr[i] = (__uint128_t)~hash << 32 | index; - - if (arr[i] > arr[last]) { - last = i; - } else if (last == i) { - for (int j = k - 1; j >= 0; j--) { - if (arr[j] > arr[last]) { - last = j; - } - } - } - - if (++i == k) - i = 0; - } - - void insert(uint32_t index, uint64_t hash) { - if (k < 16) { - naive(index, hash); - } else { - return naive2(index, hash); - } - } - - uint32_t min() { - if (k < 16) { - int i = k - 1; - for (int j = k - 2; j >= 0; j--) { - if (arr[j] > arr[i]) { - i = j; - } - } - return arr[i]; - } else { - return arr[last]; - } - } - - uint64_t min_hash() { - if (k < 16) { - int i = k - 1; - for (int j = k - 2; j >= 0; j--) { - if (arr[j] > arr[i]) { - i = j; - } - } - return ~(uint64_t)(arr[i] >> 32); - } else { - return ~(uint64_t)(arr[last] >> 32); - } - } - - void min_syncmer(std::vector &vec) { - if (k < 16) { - unsigned int j = k - 1; - for (int l = k - 2; l >= 0; l--) { - if (arr[l] > arr[j]) { - j = l; - } - } - if (arr[j] >> 32 == std::max(uint32_t(arr[i] >> 32), - uint32_t(arr[i ? i - 1 : k - 1] >> 32))) { - vec.emplace_back(arr[i]); - } - } else { - if (arr[last] >> 32 == std::max(uint32_t(arr[i] >> 32), - uint32_t(arr[i ? 
i - 1 : k - 1] >> 32))) { - vec.emplace_back(arr[i]); - } - } - } - - void min_syncmer(std::vector> &vec) { - if (k < 16) { - unsigned int j = k - 1; - for (int l = k - 2; l >= 0; l--) { - if (arr[l] > arr[j]) { - j = l; - } - } - if (arr[j] >> 32 == std::max(uint32_t(arr[i] >> 32), - uint32_t(arr[i ? i - 1 : k - 1] >> 32))) { - vec.emplace_back(arr[i], ~(uint64_t)(arr[j] >> 32)); - } - } else { - if (arr[last] >> 32 == std::max(uint32_t(arr[i] >> 32), - uint32_t(arr[i ? i - 1 : k - 1] >> 32))) { - vec.emplace_back(arr[i], ~(uint64_t)(arr[last] >> 32)); - } - } - } -}; -} // namespace digest::ds +#ifndef DATA_STRUCTURE_HPP +#define DATA_STRUCTURE_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * Data structures for minimum hash queries on a window. + * ntHash does not support `large_window < 4` + * + * Selecting the correct `data_structure` + * our general guidelines: + * * for `large_window` < 12, use Naive + * * for 12 <= `large_window` <= 16 use SegmentTree + * * for `large_window` > 16 use Naive2 + * + * Adaptive performs at worst about 10% slower than best + * Adaptive64 performs at worst about 100% slower than best + */ + +namespace digest::ds { + +/** + * All data_structures must follow this interface. Add the min_syncmer functions + * for syncmer support. 
+ */ +template struct Interface { + static_assert(std::is_same() || std::is_same(), + "T must be either uint32_t or uint64_t"); + + /** constructor must accept uint32_t large_window */ + Interface(uint32_t); + + /** returns the index of the minimum hash */ + virtual uint32_t min(); + /** returns the minimum hash */ + virtual T min_hash(); + + /** appends minimum if syncmer */ + virtual void min_syncmer(std::vector &vec); + /** appends (left syncmer index, right syncmer index) */ + virtual void min_syncmer(std::vector> &vec); +}; + +// Based on a template taken from USACO.guide and then modified by me (for +// competitive programming), and now modified again (for this) +// https://usaco.guide/gold/PURS?lang=cpp +// https://codeforces.com/blog/entry/18051 (USACO.guide was probably heavily +// inspired by this) +/** A data structure that can answer point update & range minimum queries. */ + +/** + * @brief Segment Tree data structure. Supports log(n) point updates and range + * minimum queries. + * + * @tparam k large window size + */ +template struct SegmentTree { + int i = k; + std::array segtree = {}; + + constexpr int log2() { return std::ceil(std::log2(k)); } + + SegmentTree(uint32_t) {} + SegmentTree(const SegmentTree &other) = default; + SegmentTree &operator=(const SegmentTree &other) = default; + + void insert(uint32_t index, uint32_t hash) { + int ind = i; + if (++i == 2 * k) + i = k; + + // negate so we can use max so that ties are broken by rightmost + segtree[ind] = (uint64_t)~hash << 32 | index; + for (int rep = 0; rep < log2(); rep++) { + segtree[ind >> 1] = std::max(segtree[ind], segtree[ind ^ 1]); + ind >>= 1; + } + } + + uint32_t min() { return segtree[1]; } + + uint32_t min_hash() { return ~(segtree[1] >> 32); } + + void min_syncmer(std::vector &vec) { + if (segtree[1] >> 32 == + std::max(uint32_t(segtree[i] >> 32), + uint32_t(segtree[i == k ? 
2 * k - 1 : i - 1] >> 32))) { + vec.emplace_back(segtree[i]); + } + } + + void min_syncmer(std::vector> &vec) { + if (segtree[1] >> 32 == + std::max(uint32_t(segtree[i] >> 32), + uint32_t(segtree[i == k ? 2 * k - 1 : i - 1] >> 32))) { + vec.emplace_back(segtree[i], ~(segtree[1] >> 32)); + } + } +}; + +/** + * @brief Naive data structure. Naively loops through the array to find the + * minimum. + * + * @tparam k large window size + */ +template struct Naive { + std::array arr; + unsigned int i = 0; + + Naive(uint32_t){}; + Naive(const Naive &other) = default; + Naive &operator=(const Naive &other) = default; + + void insert(uint32_t index, uint32_t hash) { + arr[i] = (uint64_t)~hash << 32 | index; + if (++i == k) + i = 0; + } + + uint32_t min() { + int i = k - 1; + for (int j = k - 2; j >= 0; j--) { + if (arr[j] > arr[i]) { + i = j; + } + } + return arr[i]; + } + + uint32_t min_hash() { + int i = k - 1; + for (int j = k - 2; j >= 0; j--) { + if (arr[j] > arr[i]) { + i = j; + } + } + return ~(uint32_t)(arr[i] >> 32); + } + + void min_syncmer(std::vector &vec) { + unsigned int j = 0; + for (unsigned int l = 1; l < k; l++) { + if (arr[l] > arr[j]) { + j = l; + } + } + if (arr[j] >> 32 == std::max(uint32_t(arr[i] >> 32), + uint32_t(arr[i ? i - 1 : k - 1] >> 32))) { + vec.emplace_back(arr[i]); + } + } + + void min_syncmer(std::vector> &vec) { + unsigned int j = k - 1; + for (int l = k - 2; l >= 0; l--) { + if (arr[l] > arr[j]) { + j = l; + } + } + if (arr[j] >> 32 == std::max(uint32_t(arr[i] >> 32), + uint32_t(arr[i ? i - 1 : k - 1] >> 32))) { + vec.emplace_back(arr[i], ~(uint32_t)(arr[j] >> 32)); + } + } +}; + +/** + * @brief Naive2 data structure. Remembers the last minimum index and only loops + * through the array when this index leaves the window. 
+ * + * @tparam k large window size + */ +template struct Naive2 { + unsigned int i = 0; + unsigned int last = 0; + std::vector arr = std::vector(k); + + Naive2(uint32_t){}; + Naive2(const Naive2 &other) = default; + Naive2 &operator=(const Naive2 &other) = default; + + void insert(uint32_t index, uint32_t hash) { + // flip the hash bits so we can take the maximum + arr[i] = (uint64_t)~hash << 32 | index; + + if (arr[i] > arr[last]) { + last = i; + } else if (last == i) { + for (unsigned j = 0; j < k; j++) { + if (arr[j] > arr[last]) { + last = j; + } + } + } + + if (++i == k) + i = 0; + } + + uint32_t min() { return arr[last]; } + + uint32_t min_hash() { return ~(uint32_t)(arr[last] >> 32); } + + void min_syncmer(std::vector &vec) { + if (arr[last] >> 32 == + std::max(uint32_t(arr[i] >> 32), + uint32_t(arr[i ? i - 1 : k - 1] >> 32))) { + vec.emplace_back(arr[i]); + } + } + + void min_syncmer(std::vector> &vec) { + if (arr[last] >> 32 == + std::max(uint32_t(arr[i] >> 32), + uint32_t(arr[i ? i - 1 : k - 1] >> 32))) { + vec.emplace_back(arr[i], ~(uint32_t)(arr[last] >> 32)); + } + } +}; + +/** + * @brief Adaptive data structure. Selects between Naive and Naive2 based on the + * large window size. 
+ */ +struct Adaptive { + uint32_t k, i = 0, last = 0; + std::vector arr; + + Adaptive(uint32_t k) : k(k), arr(k) {} + Adaptive(const Adaptive &other) = default; + Adaptive &operator=(const Adaptive &other) = default; + + void naive(uint32_t index, uint32_t hash) { + arr[i] = (uint64_t)~hash << 32 | index; + if (++i == k) + i = 0; + } + + void naive2(uint32_t index, uint32_t hash) { + // flip the hash bits so we can take the maximum + arr[i] = (uint64_t)~hash << 32 | index; + + if (arr[i] > arr[last]) { + last = i; + } else if (last == i) { + for (unsigned j = 0; j < k; j++) { + if (arr[j] > arr[last]) { + last = j; + } + } + } + + if (++i == k) + i = 0; + } + + void insert(uint32_t index, uint32_t hash) { + if (k < 16) { + naive(index, hash); + } else { + naive2(index, hash); + } + } + + uint32_t min() { + if (k < 16) { + int i = k - 1; + for (int j = k - 2; j >= 0; j--) { + if (arr[j] > arr[i]) { + i = j; + } + } + return arr[i]; + } else { + return arr[last]; + } + } + + uint32_t min_hash() { + if (k < 16) { + int i = k - 1; + for (int j = k - 2; j >= 0; j--) { + if (arr[j] > arr[i]) { + i = j; + } + } + return ~(uint32_t)(arr[i] >> 32); + } else { + return ~(uint32_t)(arr[last] >> 32); + } + } + + void min_syncmer(std::vector &vec) { + if (k < 16) { + unsigned int j = k - 1; + for (int l = k - 2; l >= 0; l--) { + if (arr[l] > arr[j]) { + j = l; + } + } + if (arr[j] >> 32 == + std::max(uint32_t(arr[i] >> 32), + uint32_t(arr[i ? i - 1 : k - 1] >> 32))) { + vec.emplace_back(arr[i]); + } + } else { + if (arr[last] >> 32 == + std::max(uint32_t(arr[i] >> 32), + uint32_t(arr[i ? i - 1 : k - 1] >> 32))) { + vec.emplace_back(arr[i]); + } + } + } + + void min_syncmer(std::vector> &vec) { + if (k < 16) { + unsigned int j = k - 1; + for (int l = k - 2; l >= 0; l--) { + if (arr[l] > arr[j]) { + j = l; + } + } + if (arr[j] >> 32 == + std::max(uint32_t(arr[i] >> 32), + uint32_t(arr[i ? 
i - 1 : k - 1] >> 32))) { + vec.emplace_back(arr[i], ~(uint32_t)(arr[j] >> 32)); + } + } else { + if (arr[last] >> 32 == + std::max(uint32_t(arr[i] >> 32), + uint32_t(arr[i ? i - 1 : k - 1] >> 32))) { + vec.emplace_back(arr[i], ~(uint32_t)(arr[last] >> 32)); + } + } + } +}; + +/** + * @brief Same as Adaptive but stores and compares full 64-bit hashes. + */ +struct Adaptive64 { + uint32_t k, i = 0, last = 0; + std::vector<__uint128_t> arr; + + Adaptive64(uint32_t k) : k(k), arr(k) {} + Adaptive64(const Adaptive64 &other) = default; + Adaptive64 &operator=(const Adaptive64 &other) = default; + + void naive(uint32_t index, uint64_t hash) { + arr[i] = (__uint128_t)~hash << 32 | index; + if (++i == k) + i = 0; + } + + void naive2(uint32_t index, uint64_t hash) { + // flip the hash bits so we can take the maximum + arr[i] = (__uint128_t)~hash << 32 | index; + + if (arr[i] > arr[last]) { + last = i; + } else if (last == i) { + for (int j = k - 1; j >= 0; j--) { + if (arr[j] > arr[last]) { + last = j; + } + } + } + + if (++i == k) + i = 0; + } + + void insert(uint32_t index, uint64_t hash) { + if (k < 16) { + naive(index, hash); + } else { + return naive2(index, hash); + } + } + + uint32_t min() { + if (k < 16) { + int i = k - 1; + for (int j = k - 2; j >= 0; j--) { + if (arr[j] > arr[i]) { + i = j; + } + } + return arr[i]; + } else { + return arr[last]; + } + } + + uint64_t min_hash() { + if (k < 16) { + int i = k - 1; + for (int j = k - 2; j >= 0; j--) { + if (arr[j] > arr[i]) { + i = j; + } + } + return ~(uint64_t)(arr[i] >> 32); + } else { + return ~(uint64_t)(arr[last] >> 32); + } + } + + void min_syncmer(std::vector &vec) { + if (k < 16) { + unsigned int j = k - 1; + for (int l = k - 2; l >= 0; l--) { + if (arr[l] > arr[j]) { + j = l; + } + } + if (arr[j] >> 32 == + std::max(uint64_t(arr[i] >> 32), + uint64_t(arr[i ? i - 1 : k - 1] >> 32))) { + vec.emplace_back(arr[i]); + } + } else { + if (arr[last] >> 32 == + std::max(uint64_t(arr[i] >> 32), + uint64_t(arr[i ?
i - 1 : k - 1] >> 32))) { + vec.emplace_back(arr[i]); + } + } + } + + void min_syncmer(std::vector> &vec) { + if (k < 16) { + unsigned int j = k - 1; + for (int l = k - 2; l >= 0; l--) { + if (arr[l] > arr[j]) { + j = l; + } + } + if (arr[j] >> 32 == + std::max(uint64_t(arr[i] >> 32), + uint64_t(arr[i ? i - 1 : k - 1] >> 32))) { + vec.emplace_back(arr[i], ~(uint64_t)(arr[j] >> 32)); + } + } else { + if (arr[last] >> 32 == + std::max(uint64_t(arr[i] >> 32), + uint64_t(arr[i ? i - 1 : k - 1] >> 32))) { + vec.emplace_back(arr[i], ~(uint64_t)(arr[last] >> 32)); + } + } + } +}; +} // namespace digest::ds + +#endif // DATA_STRUCTURE_HPP diff --git a/include/digest/digester.hpp b/include/digest/digester.hpp index ed57fa3..bef9ab0 100644 --- a/include/digest/digester.hpp +++ b/include/digest/digester.hpp @@ -1,330 +1,650 @@ -#pragma once - -#include -#include -#include -#include - -namespace digest { - -class BadConstructionException : public std::exception { - const char *what() const throw() { - return "k must be greater than 3, start must be less than len"; - } -}; - -class NotRolledTillEndException : public std::exception { - const char *what() const throw() { - return "Iterator must be at the end of the current sequence before " - "appending a new one."; - } -}; - -enum class MinimizedHashType { CANON, FORWARD, REVERSE }; - -enum class BadCharPolicy { WRITEOVER, SKIPOVER }; - -// Only supports characters in DNA and N, upper or lower case -template class Digester { -public: - /** - * @param seq char pointer poitning to the c-string of DNA sequence to be - * hashed. - * @param len length of seq. - * @param k k-mer size. - * @param start 0-indexed position in seq to start hashing from.
- * @param minimized_h hash to be minimized, 0 for canoncial, 1 for forward, 2 - * for reverse - * - * @throws BadConstructionException Thrown if k is less than 4, - * or if the starting position is after the end of the string - * or if minimized_h is greater than 2 - */ - Digester(const char *seq, size_t len, unsigned k, size_t start = 0, - MinimizedHashType minimized_h = MinimizedHashType::CANON) - : seq(seq), len(len), offset(0), start(start), end(start + k), chash(0), - fhash(0), rhash(0), k(k), minimized_h(minimized_h) { - if (k < 4 or start >= len or (int) minimized_h > 2) { - throw BadConstructionException(); - } - init_hash(); - } - - /** - * @param seq reference to std string of DNA sequence to be hashed. - * @param k k-mer size. - * @param start 0-indexed position in seq to start hashing from. - * @param minimized_h hash to be minimized, 0 for canoncial, 1 for forward, 2 - * for reverse - * - * @throws BadConstructionException Thrown if k is less than 4, - * or if the starting position is after the end of the string - */ - Digester(const std::string &seq, unsigned k, size_t start = 0, - MinimizedHashType minimized_h = MinimizedHashType::CANON) - : Digester(seq.c_str(), seq.size(), k, start, minimized_h) {} - - virtual ~Digester() = default; - - /** - * @return bool, true if values of the 3 hashes are meaningful, false - * otherwise, i.e. 
the object wasn't able to initialize with a valid hash or - * roll_one() was called when already at end of sequence - */ - bool get_is_valid_hash() { return is_valid_hash; } - - unsigned get_k() { return k; } - - size_t get_len() { return len; } - - /** - * @brief moves the internal pointer to the next valid k-mer, skipping over - * any k-mers that have contain a non ACTG character, and returns hash for - * that k-mer Time Complexity: O(1) - * - * @return bool, true if we were able generate a valid hash, false otherwise - */ - bool roll_one() { - if (P == BadCharPolicy::SKIPOVER) { - return roll_one_skip_over(); - } else { - return roll_one_write_over(); - } - }; - - /** - * @brief returns the positions, as defined by get_pos(), of minimizers up to - * the amount specified - * - * @param amount number of minimizers you want to generate - * @param vec a reference to a vector of size_t's, the positions returned will - * go there - */ - virtual void roll_minimizer(unsigned amount, std::vector &vec) = 0; - - /** - * @brief returns the positions (pair.first), as defined by get_pos(), and the - * hashes (pair.second) of minimizers up to the amount specified - * - * @param amount number of minimizers you want to generate - * @param vec a reference to a vector of size_t's, the positions returned will - * go there - */ - virtual void - roll_minimizer(unsigned amount, - std::vector> &vec) = 0; - - /** - * @return current index of the first character of the current kmer that has - * been hashed strings that have been appended onto each other count as 1 big - * string, i.e. 
if you first had a string of length 10 and then appended - * another string of length 20, and the index of the first character of the - * current k-mer is at index 4, 0-indexed, in the second string, then it will - * return 14 - */ - size_t get_pos() { return offset + start - c_outs.size(); } - - uint64_t get_chash() { return chash; } - - uint64_t get_fhash() { return fhash; } - - uint64_t get_rhash() { return rhash; } - - /** - * @brief replaces the current sequence with the new one, it's like starting - * over with a completely new string - * - * @param seq char pointer to new sequence to be hashed - * @param len length of the new sequence - * @param start position in new sequence to start from - * - * @throws BadConstructionException thrown if the starting position is greater - * than the length of the string - */ - void new_seq(const char *seq, size_t len, size_t start) { - this->seq = seq; - this->len = len; - this->offset = 0; - this->start = start; - this->end = start + this->k; - is_valid_hash = false; - if (start >= len) { - throw BadConstructionException(); - } - init_hash(); - } - - /** - * @brief replaces the current sequence with the new one, it's like starting - * over with a completely new string - * - * @param seq std string reference to the new sequence to be hashed - * @param start position in new sequence to start from - * - * @throws BadConstructionException thrown if the starting position is greater - * than the length of the string - */ - void new_seq(const std::string &seq, size_t pos) { - new_seq(seq.c_str(), seq.size(), pos); - } - - /** - * @brief simulates the appending of a new sequence to the end of the old - * sequence The old string will no longer be stored, but the rolling hashes - * will be able to preceed as if the strings were appended Can only be called - * when you've reached the end of the current string i.e. 
if you're current - * sequence is ACTGAC, and you have reached the end of this sequence, and you - * call append_seq with the sequence CCGGCCGG, then the minimizers you will - * get after calling append_seq plus the minimizers you got from going through - * ACTGAC, will be equivalent to the minimizers you would have gotten from - * rolling across ACTGACCCGGCCGG - * - * @param seq C string of DNA sequence to be appended - * @param len length of the sequence - * - * @throws NotRolledTillEndException Thrown when the internal iterator is not - * at the end of the current sequence - */ - void append_seq(const char *seq, size_t len) { - if (P == BadCharPolicy::SKIPOVER) { - append_seq_skip_over(seq, len); - } else { - append_seq_write_over(seq, len); - } - } - - /** - * @brief simulates the appending of a new sequence to the end of the old - * sequence The old string will no longer be stored, but the rolling hashes - * will be able to preceed as if the strings were appended Can only be called - * when you've reached the end of the current string i.e. 
if you're current - * sequence is ACTGAC, and you have reached the end of this sequence, and you - * call append_seq with the sequence CCGGCCGG, then the minimizers you will - * get after calling append_seq plus the minimizers you got from going through - * ACTGAC, will be equivalent to the minimizers you would have gotten from - * rolling across ACTGACCCGGCCGG - * - * @param seq std string of DNA sequence to be appended - * - * @throws NotRolledTillEndException Thrown when the internal iterator is not - * at the end of the current sequence - */ - void append_seq(const std::string &seq) { - if (P == BadCharPolicy::SKIPOVER) { - append_seq_skip_over(seq.c_str(), seq.size()); - } else { - append_seq_write_over(seq.c_str(), seq.size()); - } - } - - /** - * @return unsigned, a number representing the hash you are minimizing, 0 for - * canoncial, 1 for forward, 2 for reverse - */ - MinimizedHashType get_minimized_h() { return minimized_h; } - - /** - * @return const char* representation of the sequence - */ - const char *get_sequence() { return seq; } - -protected: - // 0x41 = 'A', 0x43 = 'C', 0x47 = 'G' 0x54 = 'T' - // 0x61 = 'a', 0x63 = 'c', 0x67 = 'g' 0x74 = 't' - std::array actg{ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT - 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT - 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT - 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT - 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, // NOLINT - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT - }; - - /** - * Helper function - * - * @param in char to be checked - * @return bool, true if in is an upper or lowercase ACTG character, false - * otherwise - */ - bool is_ACTG(char in) { return actg[in]; } - - /** - * @brief Helper function that initializes the hash values at the first valid - * k-mer at or after start Sets is_valid_hash to be equal to its return value - * - * @return bool, true on success, a valid hash is initialized, false otherwise - */ - bool init_hash() { - if (P == BadCharPolicy::SKIPOVER) { - return init_hash_skip_over(); - } else { - return init_hash_write_over(); - } - }; - - void append_seq_skip_over(const char *seq, size_t len); - - void append_seq_write_over(const char *seq, size_t len); - - bool init_hash_skip_over(); - - bool init_hash_write_over(); - - bool roll_one_skip_over(); - - bool roll_one_write_over(); - - // sequence to be digested, memory is owned by the user - const char *seq; - - // length of seq - size_t len; - - // the combined length of all the previous strings that have been appended - // together, not counting the current string - size_t offset; - - // internal index of the next character to be thrown out, junk if c_outs is - // not empty - size_t start; - - // internal index of next character to be added - size_t end; - - // canonical hash of current k-mer - uint64_t chash; - - // forward hash of current k-mer - uint64_t fhash; - - // reverse hash of current k-mer - uint64_t rhash; - - // length of kmer - unsigned k; - - // deque of characters to be rolled out in the rolling hash from left to right - std::deque c_outs; - - // Hash value to be minimized, 0 for canonical, 1 for forward, 2 for reverse - MinimizedHashType minimized_h; - - // bool representing whether the current hash is meaningful, i.e. 
corresponds - // to the k-mer at get_pos() - bool is_valid_hash = false; -}; - -} // namespace digest - -#include "digester.tpp" +#ifndef DIGESTER_HPP +#define DIGESTER_HPP + +#include +#include +#include +#include + +/** + * @brief digest code. + */ +namespace digest { + +/** + * @brief Exception thrown when initializing a Digester with k (kmer size) < 4 + * or start (starting index) >= len (length of sequence). + * + * + */ +class BadConstructionException : public std::exception { + const char *what() const throw() { + return "k must be greater than 3, start must be less than len"; + } +}; + +/** + * @brief Exception thrown when append_seq() is called before all kmers/large + * windows in the current sequence have been iterated over + * + */ +class NotRolledTillEndException : public std::exception { + const char *what() const throw() { + return "Iterator must be at the end of the current sequence before " + "appending a new one."; + } +}; + +/** + * @brief Enum values for the type of hash to minimize + */ +enum class MinimizedHashType { + /** minimize the canonical hash */ + CANON, + /** minimize the forward hash */ + FORWARD, + /** minimize the reverse hash */ + REVERSE +}; + +/** + * @brief Specifies behavior with non-ACTG characters. + */ +enum class BadCharPolicy { + /** The WRITEOVER policy specifies that any non-ACTG character is simply + replaced with an A. */ + WRITEOVER, + /** The SKIPOVER policy skips over any kmers with a non-ACTG character. + * + * For example, if you have k = 4 and your sequence is ACTGNNACTGAC, then + * the only kmers that would be considered would be the ACTG starting at + * index 0, the ACTG starting at index 6, CTGA at index 7, and TGAC at + * index 8. Then if you had a large window of 4 (kmers), then the smallest + * would be picked from one of those 4. + */ + SKIPOVER +}; + +/** + * @brief an abstract class for Digester objects. + * + * @tparam a BadCharPolicy enum value. 
The policy to adopt when handling + * non-ACTG characters. + * + */ +template class Digester { + public: + /** + * @param seq const char pointer pointing to the c-string of DNA sequence to + * be hashed. + * @param len length of seq. + * @param k kmer size. + * @param start 0-indexed position in seq to start hashing from. + * @param minimized_h whether we are minimizing the canonical, forward, or + * reverse hash + * + * @throws BadConstructionException Thrown if k is less than 4, + * or if the starting position is after the end of the string + */ + Digester(const char *seq, size_t len, unsigned k, size_t start = 0, + MinimizedHashType minimized_h = MinimizedHashType::CANON) + : seq(seq), len(len), offset(0), start(start), end(start + k), chash(0), + fhash(0), rhash(0), k(k), minimized_h(minimized_h) { + if (k < 4 or start >= len or (int) minimized_h > 2) { + throw BadConstructionException(); + } + init_hash(); + } + + /** + * @param seq const string of the DNA sequence to be hashed. + * @param k kmer size. + * @param start 0-indexed position in seq to start hashing from. + * @param minimized_h whether we are minimizing the canonical, forward, or + * reverse hash + * + * @throws BadConstructionException Thrown if k is less than 4, + * or if the starting position is after the end of the string + */ + Digester(const std::string &seq, unsigned k, size_t start = 0, + MinimizedHashType minimized_h = MinimizedHashType::CANON) + : Digester(seq.c_str(), seq.size(), k, start, minimized_h) {} + + virtual ~Digester() = default; + + /** + * @return bool, true if values of the 3 hashes are meaningful, false + * otherwise, i.e. 
the object wasn't able to initialize with a valid hash or + * roll_one() was called when already at end of sequence + */ + bool get_is_valid_hash() { return is_valid_hash; } + + /** + * @return unsigned, the value of k (kmer size) + */ + unsigned get_k() { return k; } + + /** + * @return size_t, the length of the sequence + */ + size_t get_len() { return len; } + + /** + * @brief moves the internal pointer to the next valid k-mer.
+ * Time Complexity: O(1) + * + * @return bool, true if we were able generate a valid hash, false otherwise + */ + bool roll_one() { + if (P == BadCharPolicy::SKIPOVER) { + return roll_one_skip_over(); + } else { + return roll_one_write_over(); + } + }; + + /** + * @brief gets the positions, as defined by get_pos(), of minimizers up to + * the amount specified + * + * @param amount number of minimizers you want to generate + * @param vec a reference to a vector of uint32_t, the positions returned + * will go there + */ + virtual void roll_minimizer(unsigned amount, + std::vector &vec) = 0; + + /** + * @brief gets the positions (pair.first), as defined by get_pos(), and the + * hashes (pair.second) of minimizers up to the amount specified + * + * @param amount number of minimizers you want to generate + * @param vec a reference to a vector of a pair of uint32_t, the positions + * and hashes returned will go there + */ + virtual void + roll_minimizer(unsigned amount, + std::vector> &vec) = 0; + + /** + * @return current index of the first character of the current kmer that has + * been hashed. Sequences that have been appended onto each other count as 1 + * big sequence, i.e. if you first had a sequence of length 10 and then + * appended another sequence of length 20, and the index of the first + * character of the current k-mer is at index 4, 0-indexed, in the second + * sequence, then it will return 14 + */ + size_t get_pos() { return offset + start - c_outs.size(); } + + /** + * @return uint64_t, the canonical hash of the kmer that was rolled over + * when roll_one was last called (roll_minimizer() calls roll_one() + * internally). + */ + uint64_t get_chash() { return chash; } + + /** + * @return uint64_t, the forward hash of the kmer that was rolled over when + * roll_one was last called (roll_minimizer() calls roll_one() internally). 
+ */ + uint64_t get_fhash() { return fhash; } + + /** + * @return uint64_t, the reverse hash of the kmer that was rolled over when + * roll_one was last called (roll_minimizer() calls roll_one() internally). + */ + uint64_t get_rhash() { return rhash; } + + /** + * @brief replaces the current sequence with the new one. It's like starting + * over with a completely new sequence + * + * @param seq const char pointer to new sequence to be hashed + * @param len length of the new sequence + * @param start position in new sequence to start from + * + * @throws BadConstructionException thrown if the starting position is + * greater than or equal to the length of the string + */ + void new_seq(const char *seq, size_t len, size_t start) { + this->seq = seq; + this->len = len; + this->offset = 0; + this->start = start; + this->end = start + this->k; + is_valid_hash = false; + if (start >= len) { + throw BadConstructionException(); + } + init_hash(); + } + + /** + * @brief replaces the current sequence with the new one. It's like starting + * over with a completely new sequence + * + * @param seq const std string reference to the new sequence to be hashed + * @param pos position in new sequence to start from + * + * @throws BadConstructionException thrown if the starting position is + * greater than or equal to the length of the string + */ + void new_seq(const std::string &seq, size_t pos) { + new_seq(seq.c_str(), seq.size(), pos); + } + + /** + * @brief simulates the appending of a new sequence to the end of the old + * sequence. The old sequence will no longer be stored, but the rolling + * hashes will be able to proceed as if the sequences were appended. Can + * only be called when you've reached the end of the current sequence i.e. 
+ * if your current sequence is ACTGAC, and you have reached the end of + * this sequence, and you call append_seq with the sequence CCGGCCGG, then + * the minimizers you will get after calling append_seq plus the minimizers + * you got from going through ACTGAC, will be equivalent to the minimizers + * you would have gotten from rolling across ACTGACCCGGCCGG + * + * @param seq const C string of DNA sequence to be appended + * @param len length of the sequence + * + * @throws NotRolledTillEndException Thrown when the internal iterator is + * not at the end of the current sequence + */ + void append_seq(const char *seq, size_t len) { + if (P == BadCharPolicy::SKIPOVER) { + append_seq_skip_over(seq, len); + } else { + append_seq_write_over(seq, len); + } + } + + /** + * @brief simulates the appending of a new sequence to the end of the old + * sequence. The old sequence will no longer be stored, but the rolling + * hashes will be able to proceed as if the sequences were appended. Can + * only be called when you've reached the end of the current sequence i.e. 
+ * if your current sequence is ACTGAC, and you have reached the end of + * this sequence, and you call append_seq with the sequence CCGGCCGG, then + * the minimizers you will get after calling append_seq plus the minimizers + * you got from going through ACTGAC, will be equivalent to the minimizers + * you would have gotten from rolling across ACTGACCCGGCCGG + * + * @param seq const std string of DNA sequence to be appended + * + * @throws NotRolledTillEndException Thrown when the internal iterator is + * not at the end of the current sequence + */ + void append_seq(const std::string &seq) { + if (P == BadCharPolicy::SKIPOVER) { + append_seq_skip_over(seq.c_str(), seq.size()); + } else { + append_seq_write_over(seq.c_str(), seq.size()); + } + } + + /** + * @return MinimizedHashType, the hash being minimized: CANON (canonical), + * FORWARD, or REVERSE + */ + MinimizedHashType get_minimized_h() { return minimized_h; } + + /** + * @return const char* representation of the sequence + */ + const char *get_sequence() { return seq; } + + protected: + // 0x41 = 'A', 0x43 = 'C', 0x47 = 'G' 0x54 = 'T' + // 0x61 = 'a', 0x63 = 'c', 0x67 = 'g' 0x74 = 't' + std::array actg{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT + 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT + 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT + 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT + 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT + 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT + }; + + /** + * @internal + * Helper function + * + * @param in char to be checked + * + * @return bool, true if in is an upper or lowercase ACTG character, false + * otherwise + */ + bool is_ACTG(char in) { return actg[in]; } + + /** + * @internal + * + * @brief Helper function that initializes the hash values at the first + * valid k-mer at or after start Sets is_valid_hash to be equal to its + * return value + * + * @return bool, true on success, a valid hash is initialized, false + * otherwise + */ + bool init_hash() { + if (P == BadCharPolicy::SKIPOVER) { + return init_hash_skip_over(); + } else { + return init_hash_write_over(); + } + }; + + void append_seq_skip_over(const char *seq, size_t len) { + if (end < this->len) { + throw NotRolledTillEndException(); + } + offset += this->len; + size_t ind = this->len - 1; + + /* + this is for the case where we call append_seq after having + previously called append_seq and not having gotten through the deque + In such a case, since append_seq initializes a hash, we need to get + rid of the first character in the deque since if we just initialized + the hash without doing this, it would be identical the the current + hash held by the object + + However, there is also the case that a hash was never previously + initialized, such as when the length of the string used in the + previous append_seq call, plus the the amount of ACTG characters + after the last non-ACTG character in the original string summed to be + less than k In this case, it would not be correct to remove the first + character in the deque + */ + if ((start != end || c_outs.size() == k) && c_outs.size() > 0) { + c_outs.pop_front(); + } + + // the following copies in characters from the end of the old sequence + // into the deque + std::vector temp_vec; + while (temp_vec.size() + c_outs.size() < k - 
1 && ind >= start) { + if (!is_ACTG(this->seq[ind])) + break; + + temp_vec.push_back(this->seq[ind]); + if (ind == 0) + break; + + ind--; + } + for (std::vector::reverse_iterator rit = temp_vec.rbegin(); + rit != temp_vec.rend(); rit++) { + c_outs.push_back(*rit); + } + + // the following copies in characters from the front of the new sequence + // if there weren't enough non-ACTG characters at the end of the old + // sequence + ind = 0; + start = 0; + end = 0; + while (c_outs.size() < k && ind < len) { + if (!is_ACTG(seq[ind])) { + start = ind + 1; + end = start + k; + this->seq = seq; + this->len = len; + c_outs.clear(); + init_hash(); + break; + } + c_outs.push_back(seq[ind]); + ind++; + start++; + end++; + } + + // the following initializes a hash if we managed to fill the deque + if (c_outs.size() == k) { + std::string temp(c_outs.begin(), c_outs.end()); + // nthash::ntc64(temp.c_str(), k, fhash, rhash, chash, + // locn_useless); + fhash = base_forward_hash(temp.c_str(), k); + rhash = base_reverse_hash(temp.c_str(), k); + chash = nthash::canonical(fhash, rhash); + is_valid_hash = true; + } + this->seq = seq; + this->len = len; + } + + void append_seq_write_over(const char *seq, size_t len) { + if (end < this->len) { + throw NotRolledTillEndException(); + } + offset += this->len; + size_t ind = this->len - 1; + + if ((start != end || c_outs.size() == k) && c_outs.size() > 0) { + c_outs.pop_front(); + } + + // the following copies in characters from the end of the old sequence + // into the deque + std::vector temp_vec; + while (temp_vec.size() + c_outs.size() < k - 1 && ind >= start) { + if (!is_ACTG(this->seq[ind])) { + temp_vec.push_back('A'); + } else { + temp_vec.push_back(this->seq[ind]); + } + if (ind == 0) + break; + + ind--; + } + for (std::vector::reverse_iterator rit = temp_vec.rbegin(); + rit != temp_vec.rend(); rit++) { + c_outs.push_back(*rit); + } + + // the following copies in characters from the front of the new sequence + // if there weren't 
enough non-ACTG characters at the end of the old + // sequence + ind = 0; + start = 0; + end = 0; + while (c_outs.size() < k && ind < len) { + if (!is_ACTG(seq[ind])) { + c_outs.push_back('A'); + } else { + c_outs.push_back(seq[ind]); + } + + ind++; + start++; + end++; + } + + // the following initializes a hash if we managed to fill the deque + if (c_outs.size() == k) { + std::string temp(c_outs.begin(), c_outs.end()); + // nthash::ntc64(temp.c_str(), k, fhash, rhash, chash, + // locn_useless); + fhash = base_forward_hash(temp.c_str(), k); + rhash = base_reverse_hash(temp.c_str(), k); + chash = nthash::canonical(fhash, rhash); + is_valid_hash = true; + } + this->seq = seq; + this->len = len; + } + + bool init_hash_skip_over() { + c_outs.clear(); + while (end - 1 < len) { + bool works = true; + for (size_t i = start; i < end; i++) { + if (!is_ACTG(seq[i])) { + start = i + 1; + end = start + k; + works = false; + break; + } + } + if (!works) { + continue; + } + // nthash::ntc64(seq + start, k, fhash, rhash, chash, locn_useless); + fhash = base_forward_hash(seq + start, k); + rhash = base_reverse_hash(seq + start, k); + chash = nthash::canonical(fhash, rhash); + is_valid_hash = true; + return true; + } + is_valid_hash = false; + return false; + } + + // need to do a good bit of rewriting + // not performance critical so it's kinda whatever + bool init_hash_write_over() { + c_outs.clear(); + while (end - 1 < len) { + std::string init_str; + for (size_t i = start; i < end; i++) { + if (!is_ACTG(seq[i])) { + init_str.push_back('A'); + } else { + init_str.push_back(seq[i]); + } + } + + // nthash::ntc64(seq + start, k, fhash, rhash, chash, locn_useless); + fhash = base_forward_hash(init_str.c_str(), k); + rhash = base_reverse_hash(init_str.c_str(), k); + chash = nthash::canonical(fhash, rhash); + is_valid_hash = true; + return true; + } + is_valid_hash = false; + return false; + } + + bool roll_one_skip_over() { + if (!is_valid_hash) { + return false; + } + if (end >= 
len) { + is_valid_hash = false; + return false; + } + if (c_outs.size() > 0) { + if (is_ACTG(seq[end])) { + fhash = next_forward_hash(fhash, k, c_outs.front(), seq[end]); + rhash = next_reverse_hash(rhash, k, c_outs.front(), seq[end]); + c_outs.pop_front(); + end++; + chash = nthash::canonical(fhash, rhash); + return true; + } else { + // c_outs will contain at most k-1 characters, so if we jump to + // end + 1, we won't consider anything else in deque so we + // should clear it + c_outs.clear(); + start = end + 1; + end = start + k; + return init_hash(); + } + } else { + if (is_ACTG(seq[end])) { + fhash = next_forward_hash(fhash, k, seq[start], seq[end]); + rhash = next_reverse_hash(rhash, k, seq[start], seq[end]); + start++; + end++; + chash = nthash::canonical(fhash, rhash); + return true; + } else { + start = end + 1; + end = start + k; + return init_hash(); + } + } + } + + bool roll_one_write_over() { + if (!is_valid_hash) { + return false; + } + if (end >= len) { + is_valid_hash = false; + return false; + } + char next_char = is_ACTG(seq[end]) ? seq[end] : 'A'; + if (c_outs.size() > 0) { + fhash = next_forward_hash(fhash, k, c_outs.front(), next_char); + rhash = next_reverse_hash(rhash, k, c_outs.front(), next_char); + c_outs.pop_front(); + end++; + + } else { + char out_char = is_ACTG(seq[start]) ? 
seq[start] : 'A'; + fhash = next_forward_hash(fhash, k, out_char, next_char); + rhash = next_reverse_hash(rhash, k, out_char, next_char); + start++; + end++; + } + chash = nthash::canonical(fhash, rhash); + return true; + } + + // sequence to be digested, memory is owned by the user + const char *seq; + + // length of seq + size_t len; + + // the combined length of all the previous strings that have been appended + // together, not counting the current string + size_t offset; + + // internal index of the next character to be thrown out, junk if c_outs is + // not empty + size_t start; + + // internal index of next character to be added + size_t end; + + // canonical hash of current k-mer + uint64_t chash; + + // forward hash of current k-mer + uint64_t fhash; + + // reverse hash of current k-mer + uint64_t rhash; + + // length of kmer + unsigned k; + + // deque of characters to be rolled out in the rolling hash from left to + // right + std::deque c_outs; + + // Hash value to be minimized, 0 for canonical, 1 for forward, 2 for reverse + MinimizedHashType minimized_h; + + // bool representing whether the current hash is meaningful, i.e. + // corresponds to the k-mer at get_pos() + bool is_valid_hash = false; +}; + +} // namespace digest + +#endif // DIGESTER_HPP diff --git a/include/digest/digester.tpp b/include/digest/digester.tpp deleted file mode 100644 index 889d247..0000000 --- a/include/digest/digester.tpp +++ /dev/null @@ -1,252 +0,0 @@ -#include "digest/digester.hpp" - -namespace digest{ - - template - void Digester

::append_seq_skip_over(const char* seq, size_t len){ - if(end < this->len){ - throw NotRolledTillEndException(); - } - offset += this->len; - size_t ind = this->len-1; - - /* - this is for the case where we call append_seq after having previously called append_seq and not having gotten through the deque - In such a case, since append_seq initializes a hash, we need to get rid of the first character in the deque since if we just initialized the hash - without doing this, it would be identical the the current hash held by the object - - However, there is also the case that a hash was never previously initialized, such as when the length of the string used in the previous append_seq call, - plus the the amount of ACTG characters after the last non-ACTG character in the original string summed to be less than k - In this case, it would not be correct to remove the first character in the deque - */ - if((start != end || c_outs.size() == k) && c_outs.size() > 0){ - c_outs.pop_front(); - } - - // the following copies in characters from the end of the old sequence into the deque - std::vector temp_vec; - while(temp_vec.size() + c_outs.size()< k-1 && ind >= start){ - if(!is_ACTG(this->seq[ind])) break; - - temp_vec.push_back(this->seq[ind]); - if(ind == 0) break; - - ind--; - } - for(std::vector::reverse_iterator rit = temp_vec.rbegin(); rit != temp_vec.rend(); rit++){ - c_outs.push_back(*rit); - } - - // the following copies in characters from the front of the new sequence if there weren't enough non-ACTG characters at the end of the old sequence - ind = 0; - start = 0; - end = 0; - while(c_outs.size() < k && ind < len){ - if(!is_ACTG(seq[ind])){ - start = ind+1; - end = start + k; - this->seq = seq; - this->len = len; - c_outs.clear(); - init_hash(); - break; - } - c_outs.push_back(seq[ind]); - ind++; - start++; - end++; - } - - // the following initializes a hash if we managed to fill the deque - if(c_outs.size() == k){ - std::string temp(c_outs.begin(), c_outs.end()); - 
// nthash::ntc64(temp.c_str(), k, fhash, rhash, chash, locn_useless); - fhash = base_forward_hash(temp.c_str(), k); - rhash = base_reverse_hash(temp.c_str(), k); - chash = nthash::canonical(fhash, rhash); - is_valid_hash = true; - - } - this->seq = seq; - this->len = len; - } - - template - void Digester

::append_seq_write_over(const char* seq, size_t len){ - if(end < this->len){ - throw NotRolledTillEndException(); - } - offset += this->len; - size_t ind = this->len-1; - - if((start != end || c_outs.size() == k) && c_outs.size() > 0){ - c_outs.pop_front(); - } - - // the following copies in characters from the end of the old sequence into the deque - std::vector temp_vec; - while(temp_vec.size() + c_outs.size()< k-1 && ind >= start){ - if(!is_ACTG(this->seq[ind])){ - temp_vec.push_back('A'); - }else{ - temp_vec.push_back(this->seq[ind]); - } - if(ind == 0) break; - - ind--; - } - for(std::vector::reverse_iterator rit = temp_vec.rbegin(); rit != temp_vec.rend(); rit++){ - c_outs.push_back(*rit); - } - - // the following copies in characters from the front of the new sequence if there weren't enough non-ACTG characters at the end of the old sequence - ind = 0; - start = 0; - end = 0; - while(c_outs.size() < k && ind < len){ - if(!is_ACTG(seq[ind])){ - c_outs.push_back('A'); - }else{ - c_outs.push_back(seq[ind]); - } - - ind++; - start++; - end++; - } - - // the following initializes a hash if we managed to fill the deque - if(c_outs.size() == k){ - std::string temp(c_outs.begin(), c_outs.end()); - // nthash::ntc64(temp.c_str(), k, fhash, rhash, chash, locn_useless); - fhash = base_forward_hash(temp.c_str(), k); - rhash = base_reverse_hash(temp.c_str(), k); - chash = nthash::canonical(fhash, rhash); - is_valid_hash = true; - - } - this->seq = seq; - this->len = len; - } - - template - bool Digester

::init_hash_skip_over(){ - c_outs.clear(); - while(end-1 < len){ - bool works = true; - for(size_t i = start; i < end; i++){ - if(!is_ACTG(seq[i])){ - start = i+1; - end = start + k; - works = false; - break; - } - } - if(!works){ - continue; - } - // nthash::ntc64(seq + start, k, fhash, rhash, chash, locn_useless); - fhash = base_forward_hash(seq + start, k); - rhash = base_reverse_hash(seq + start, k); - chash = nthash::canonical(fhash, rhash); - is_valid_hash = true; - return true; - } - is_valid_hash = false; - return false; - } - - // need to do a good bit of rewriting - // not performance critical so it's kinda whatever - template - bool Digester

::init_hash_write_over(){ - c_outs.clear(); - while(end-1 < len){ - std::string init_str; - for(size_t i = start; i < end; i++){ - if(!is_ACTG(seq[i])){ - init_str.push_back('A'); - }else{ - init_str.push_back(seq[i]); - } - } - - // nthash::ntc64(seq + start, k, fhash, rhash, chash, locn_useless); - fhash = base_forward_hash(init_str.c_str(), k); - rhash = base_reverse_hash(init_str.c_str(), k); - chash = nthash::canonical(fhash, rhash); - is_valid_hash = true; - return true; - } - is_valid_hash = false; - return false; - } - - template - bool Digester

::roll_one_skip_over(){ - if(!is_valid_hash){ - return false; - } - if(end >= len){ - is_valid_hash = false; - return false; - } - if(c_outs.size() > 0){ - if(is_ACTG(seq[end])){ - fhash = next_forward_hash(fhash, k, c_outs.front(), seq[end]); - rhash = next_reverse_hash(rhash, k, c_outs.front(), seq[end]); - c_outs.pop_front(); - end++; - chash = nthash::canonical(fhash, rhash); - return true; - }else{ - // c_outs will contain at most k-1 characters, so if we jump to end + 1, we won't consider anything else in deque so we should clear it - c_outs.clear(); - start = end+1; - end = start + k; - return init_hash(); - } - }else{ - if(is_ACTG(seq[end])){ - fhash = next_forward_hash(fhash, k, seq[start], seq[end]); - rhash = next_reverse_hash(rhash, k, seq[start], seq[end]); - start++; - end++; - chash = nthash::canonical(fhash,rhash); - return true; - }else{ - start = end+1; - end = start + k; - return init_hash(); - } - } - } - - template - bool Digester

::roll_one_write_over(){ - if(!is_valid_hash){ - return false; - } - if(end >= len){ - is_valid_hash = false; - return false; - } - char next_char = is_ACTG(seq[end]) ? seq[end] : 'A'; - if(c_outs.size() > 0){ - fhash = next_forward_hash(fhash, k, c_outs.front(), next_char); - rhash = next_reverse_hash(rhash, k, c_outs.front(), next_char); - c_outs.pop_front(); - end++; - - }else{ - char out_char = is_ACTG(seq[start]) ? seq[start] : 'A'; - fhash = next_forward_hash(fhash, k, out_char, next_char); - rhash = next_reverse_hash(rhash, k, out_char, next_char); - start++; - end++; - } - chash = nthash::canonical(fhash, rhash); - return true; - - } -} diff --git a/include/digest/mod_minimizer.hpp b/include/digest/mod_minimizer.hpp index 431a807..69910fc 100644 --- a/include/digest/mod_minimizer.hpp +++ b/include/digest/mod_minimizer.hpp @@ -1,97 +1,169 @@ -#ifndef MOD_MINI_HPP -#define MOD_MINI_HPP - -#include "digest/digester.hpp" -#include -#include - -namespace digest { - -class BadModException : public std::exception { - const char *what() const throw() { - return "mod must be greater than congruence."; - } -}; -template class ModMin : public Digester

{ -public: - /** - * - * @param seq - * @param len - * @param k - * @param mod mod space to be used to calculate universal minimizers - * @param congruence value we want minimizer hashes to be congruent to in the - * mod space - * @param start - * @param minimized_h - * - * @throws BadModException Thrown when congruence is greater or equal to mod - */ - ModMin(const char *seq, size_t len, unsigned k, uint32_t mod, - uint32_t congruence = 0, size_t start = 0, - MinimizedHashType minimized_h = MinimizedHashType::CANON) - : Digester

(seq, len, k, start, minimized_h), mod(mod), - congruence(congruence) { - if (congruence >= mod) { - throw BadModException(); - } - } - - /** - * - * @param seq - * @param k - * @param mod mod space to be used to calculate universal minimizers - * @param congruence value we want minimizer hashes to be congruent to in the - * mod space - * @param start - * @param minimized_h - * - * @throws BadModException Thrown when congruence is greater or equal to mod - */ - ModMin(const std::string &seq, unsigned k, uint32_t mod, - uint32_t congruence = 0, size_t start = 0, - MinimizedHashType minimized_h = MinimizedHashType::CANON) - : ModMin

(seq.c_str(), seq.size(), k, mod, congruence, start, - minimized_h) {} - - /** - * @brief adds up to amount of positions of minimizers into vec, here a k-mer - * is considered a minimizer if its hash is congruent to congruence in the mod - * space Time Complexity: O(1) per k-mer tested - * - * @param amount - * @param vec - */ - void roll_minimizer(unsigned amount, std::vector &vec) override; - - /** - * @brief adds up to amount of positions and hashes of minimizers into vec, - * here a k-mer is considered a minimizer if its hash is congruent to - * congruence in the mod space Time Complexity: O(1) per k-mer tested - * - * @param amount - * @param vec - */ - void roll_minimizer(unsigned amount, - std::vector> &vec) override; - - /** - * @return uint32_t, the mod space being used - */ - uint32_t get_mod() { return mod; } - - /** - * @return uint32_t, the value the minimized hash must be congruent to - */ - uint32_t get_congruence() { return congruence; } - -private: - uint32_t mod; - uint32_t congruence; -}; - -} // namespace digest - -#include "mod_minimizer.tpp" -#endif +#ifndef MOD_MINIMIZER_HPP +#define MOD_MINIMIZER_HPP + +#include "digest/digester.hpp" +#include +#include + +namespace digest { + +/** + * @brief Exception thrown when initializing a mod minimizer object where the + * target value after modding is greater than the mod value. + * + * + */ +class BadModException : public std::exception { + const char *what() const throw() { + return "mod must be greater than congruence."; + } +}; + +/** + * @brief Child class of Digester that defines a minimizer as a kmer whose hash + * is equal to some target value after being modded. Parameters without a + * description are the same as the parameters in the Digester parent class. They + * are simply passed up to the parent constructor. + * + * @tparam P + */ +template class ModMin : public Digester

{ + public: + /** + * @brief + * + * @param seq + * @param len + * @param k + * @param mod mod space to be used to calculate universal minimizers + * @param congruence value we want minimizer hashes to be congruent to in + * the mod space + * @param start + * @param minimized_h + * + * @throws BadModException Thrown when congruence is greater or equal to mod + */ + ModMin(const char *seq, size_t len, unsigned k, uint32_t mod, + uint32_t congruence = 0, size_t start = 0, + MinimizedHashType minimized_h = MinimizedHashType::CANON) + : Digester

(seq, len, k, start, minimized_h), mod(mod), + congruence(congruence) { + if (congruence >= mod) { + throw BadModException(); + } + } + + /** + * + * @param seq + * @param k + * @param mod mod space to be used to calculate universal minimizers + * @param congruence value we want minimizer hashes to be congruent to in + * the mod space + * @param start + * @param minimized_h + * + * @throws BadModException Thrown when congruence is greater or equal to mod + */ + ModMin(const std::string &seq, unsigned k, uint32_t mod, + uint32_t congruence = 0, size_t start = 0, + MinimizedHashType minimized_h = MinimizedHashType::CANON) + : ModMin

(seq.c_str(), seq.size(), k, mod, congruence, start, + minimized_h) {} + + /** + * @brief adds up to amount of positions of minimizers into vec. Here a + * k-mer is considered a minimizer if its hash is congruent to congruence in + * the mod space. + * + * @param amount + * @param vec + */ + void roll_minimizer(unsigned amount, std::vector &vec) override { + if (!this->is_valid_hash) + return; + + if (this->get_minimized_h() == digest::MinimizedHashType::CANON) { + do { + if ((uint32_t)this->chash % mod == congruence) { + vec.emplace_back(this->get_pos()); + } + } while (this->roll_one() && vec.size() < amount); + return; + } + + if (this->get_minimized_h() == digest::MinimizedHashType::FORWARD) { + do { + if ((uint32_t)this->fhash % mod == congruence) { + vec.emplace_back(this->get_pos()); + } + } while (this->roll_one() && vec.size() < amount); + return; + } + + // reverse + do { + if ((uint32_t)this->rhash % mod == congruence) { + vec.emplace_back(this->get_pos()); + } + } while (this->roll_one() && vec.size() < amount); + } + + /** + * @brief adds up to amount of positions and hashes of minimizers into vec. + * Here a k-mer is considered a minimizer if its hash is congruent to + * congruence in the mod space. 
+ * + * @param amount + * @param vec + */ + void + roll_minimizer(unsigned amount, + std::vector> &vec) override { + if (!this->is_valid_hash) + return; + + if (this->get_minimized_h() == digest::MinimizedHashType::CANON) { + do { + if ((uint32_t)this->chash % mod == congruence) { + vec.emplace_back(this->get_pos(), this->chash); + } + } while (this->roll_one() && vec.size() < amount); + return; + } + + if (this->get_minimized_h() == digest::MinimizedHashType::FORWARD) { + do { + if ((uint32_t)this->fhash % mod == congruence) { + vec.emplace_back(this->get_pos(), this->fhash); + } + } while (this->roll_one() && vec.size() < amount); + return; + } + + // reverse + do { + if ((uint32_t)this->rhash % mod == congruence) { + vec.emplace_back(this->get_pos(), this->rhash); + } + } while (this->roll_one() && vec.size() < amount); + } + + /** + * @return uint32_t, the mod space being used + */ + uint32_t get_mod() { return mod; } + + /** + * @return uint32_t, the value the minimized hash must be congruent to + */ + uint32_t get_congruence() { return congruence; } + + private: + uint32_t mod; + uint32_t congruence; +}; + +} // namespace digest + +#endif // MOD_MINIMIZER_HPP diff --git a/include/digest/mod_minimizer.tpp b/include/digest/mod_minimizer.tpp deleted file mode 100644 index c9f98f8..0000000 --- a/include/digest/mod_minimizer.tpp +++ /dev/null @@ -1,65 +0,0 @@ -#include "digest/mod_minimizer.hpp" -#include - -namespace digest{ - - template - void ModMin

::roll_minimizer(unsigned amount, std::vector& vec){ - if(!this->is_valid_hash) return; - - if(this->get_minimized_h() == digest::MinimizedHashType::CANON) { - do { - if((uint32_t)this->chash % mod == congruence){ - vec.emplace_back(this->get_pos()); - } - } while(this->roll_one() && vec.size() < amount); - return; - } - - if(this->get_minimized_h() == digest::MinimizedHashType::FORWARD) { - do { - if((uint32_t)this->fhash % mod == congruence){ - vec.emplace_back(this->get_pos()); - } - } while(this->roll_one() && vec.size() < amount); - return; - } - - // reverse - do { - if((uint32_t)this->rhash % mod == congruence){ - vec.emplace_back(this->get_pos()); - } - } while(this->roll_one() && vec.size() < amount); - } - - template - void ModMin

::roll_minimizer(unsigned amount, std::vector>& vec){ - if(!this->is_valid_hash) return; - - if(this->get_minimized_h() == digest::MinimizedHashType::CANON) { - do { - if((uint32_t)this->chash % mod == congruence){ - vec.emplace_back(this->get_pos(), this->chash); - } - } while(this->roll_one() && vec.size() < amount); - return; - } - - if(this->get_minimized_h() == digest::MinimizedHashType::FORWARD) { - do { - if((uint32_t)this->fhash % mod == congruence){ - vec.emplace_back(this->get_pos(), this->fhash); - } - } while(this->roll_one() && vec.size() < amount); - return; - } - - // reverse - do { - if((uint32_t)this->rhash % mod == congruence){ - vec.emplace_back(this->get_pos(), this->rhash); - } - } while(this->roll_one() && vec.size() < amount); - } -} diff --git a/include/digest/syncmer.hpp b/include/digest/syncmer.hpp index 3a37170..49e188e 100644 --- a/include/digest/syncmer.hpp +++ b/include/digest/syncmer.hpp @@ -1,83 +1,161 @@ -#ifndef SYNC_HPP -#define SYNC_HPP -#include "digest/digester.hpp" -#include "digest/window_minimizer.hpp" - -namespace digest { - -// number of k-mers to be considered in the large window -template class Syncmer : public WindowMin { -public: - /** - * - * @param seq - * @param len - * @param k - * @param large_window - * @param start - * @param minimized_h - * - * @throws BadWindowException Thrown when congruence is greater or equal to - * mod - */ - Syncmer(const char *seq, size_t len, unsigned k, unsigned large_window, - size_t start = 0, - MinimizedHashType minimized_h = MinimizedHashType::CANON) - : WindowMin(seq, len, k, large_window, start, minimized_h) {} - - /** - * - * @param seq - * @param k - * @param large_window - * @param start - * @param minimized_h - * - * @throws BadWindowException Thrown when congruence is greater or equal to - * mod - */ - Syncmer(const std::string &seq, unsigned k, unsigned large_window, - size_t start = 0, - MinimizedHashType minimized_h = MinimizedHashType::CANON) - : Syncmer(seq.c_str(), 
seq.size(), k, large_window, start, - minimized_h) {} - - /** - * @brief - * - * @param amount - * @param vec - */ - void roll_minimizer(unsigned amount, std::vector &vec) override; - - /** - * @brief adds up to amount of positions and hashes of syncmers into vec, here - * a large window is considered a syncmer if the smallest hash in the large - * window is at the leftmost or rightmost position - * - * @param amount - * @param vec - */ - void roll_minimizer(unsigned amount, - std::vector> &vec) override; - -private: - /** - * @brief helper function which handles adding the next hash into the data - * structure - * - */ - void roll_ds_sync(std::vector &vec); - - /** - * @brief helper function which handles adding the next hash into the data - * structure - * - */ - void roll_ds_sync(std::vector> &vec); -}; - -} // namespace digest - -#include "syncmer.tpp" -#endif +#ifndef SYNCMER_HPP +#define SYNCMER_HPP + +#include "digest/digester.hpp" +#include "digest/window_minimizer.hpp" + +namespace digest { + +/** + * @brief This class inherits from WindowMinimizer (implementation reasons), but + * the represent very different things. A Syncmer is defined as a large window + * where the minimal hash among all kmers in the large window belong to either + * the leftmost or rightmost kmer. Parameters without a description are the same + * as the parameters in the Digester parent class. They are simply passed up to + * the parent constructor. + * + * @tparam P + * @tparam T The data structure to use for performing range minimum queries to + * find the minimal hash value. + */ +template class Syncmer : public WindowMin { + public: + /** + * + * @param seq + * @param len + * @param k + * @param large_window the number of kmers in the large window, i.e. the + * number of kmers to be considered during the range minimum query. 
+ * @param start + * @param minimized_h + * + * @throws BadWindowException Thrown when large_window is passed in as 0 + */ + Syncmer(const char *seq, size_t len, unsigned k, unsigned large_window, + size_t start = 0, + MinimizedHashType minimized_h = MinimizedHashType::CANON) + : WindowMin(seq, len, k, large_window, start, minimized_h) {} + + /** + * + * @param seq + * @param k + * @param large_window the number of kmers in the large window, i.e. the + * number of kmers to be considered during the range minimum query. + * @param start + * @param minimized_h + * + * @throws BadWindowException Thrown when large_window is passed in as 0 + */ + Syncmer(const std::string &seq, unsigned k, unsigned large_window, + size_t start = 0, + MinimizedHashType minimized_h = MinimizedHashType::CANON) + : Syncmer(seq.c_str(), seq.size(), k, large_window, start, + minimized_h) {} + + /** + * @brief adds up to amount of positions of syncmers into vec. Here + * a large window is considered a syncmer if the smallest hash in the large + * window is at the leftmost or rightmost position. + * + * @param amount + * @param vec + */ + void roll_minimizer(unsigned amount, std::vector &vec) override { + amount += vec.size(); + + while (this->ds_size + 1 < this->large_window and this->is_valid_hash) { + if (this->get_minimized_h() == digest::MinimizedHashType::CANON) { + this->ds.insert(this->get_pos(), this->chash); + } else if (this->get_minimized_h() == + digest::MinimizedHashType::FORWARD) { + this->ds.insert(this->get_pos(), this->fhash); + } else { + this->ds.insert(this->get_pos(), this->rhash); + } + + this->roll_one(); + this->ds_size++; + } + + while (this->is_valid_hash and vec.size() < amount) { + Syncmer::roll_ds_sync(vec); + } + } + + /** + * @brief adds up to amount of positions and hashes of syncmers into vec. + * Here a large window is considered a syncmer if the smallest hash in the + * large window is at the leftmost or rightmost position. 
+ * + * @param amount + * @param vec + */ + void + roll_minimizer(unsigned amount, + std::vector> &vec) override { + amount += vec.size(); + + while (this->ds_size + 1 < this->large_window and this->is_valid_hash) { + if (this->get_minimized_h() == digest::MinimizedHashType::CANON) { + this->ds.insert(this->get_pos(), this->chash); + } else if (this->get_minimized_h() == + digest::MinimizedHashType::FORWARD) { + this->ds.insert(this->get_pos(), this->fhash); + } else { + this->ds.insert(this->get_pos(), this->rhash); + } + + this->roll_one(); + this->ds_size++; + } + + while (this->is_valid_hash and vec.size() < amount) { + Syncmer::roll_ds_sync(vec); + } + } + + private: + /** + * @brief helper function which handles adding the next hash into the data + * structure + * + */ + void roll_ds_sync(std::vector &vec) { + if (this->get_minimized_h() == digest::MinimizedHashType::CANON) { + this->ds.insert(this->get_pos(), this->chash); + } else if (this->get_minimized_h() == + digest::MinimizedHashType::FORWARD) { + this->ds.insert(this->get_pos(), this->fhash); + } else { + this->ds.insert(this->get_pos(), this->rhash); + } + this->ds.min_syncmer(vec); + + this->roll_one(); + } + + /** + * @brief helper function which handles adding the next hash into the data + * structure + * + */ + void roll_ds_sync(std::vector> &vec) { + if (this->get_minimized_h() == digest::MinimizedHashType::CANON) { + this->ds.insert(this->get_pos(), this->chash); + } else if (this->get_minimized_h() == + digest::MinimizedHashType::FORWARD) { + this->ds.insert(this->get_pos(), this->fhash); + } else { + this->ds.insert(this->get_pos(), this->rhash); + } + this->ds.min_syncmer(vec); + + this->roll_one(); + } +}; + +} // namespace digest + +#endif // SYNCMER_HPP diff --git a/include/digest/syncmer.tpp b/include/digest/syncmer.tpp deleted file mode 100644 index a4645b6..0000000 --- a/include/digest/syncmer.tpp +++ /dev/null @@ -1,75 +0,0 @@ -#include "digest/syncmer.hpp" - -namespace digest{ - 
template - void Syncmer::roll_minimizer(unsigned amount, std::vector& vec){ - amount += vec.size(); - - while (this->ds_size + 1 < this->large_window and this->is_valid_hash) { - if(this->get_minimized_h() == digest::MinimizedHashType::CANON){ - this->ds.insert(this->get_pos(), this->chash); - }else if(this->get_minimized_h() == digest::MinimizedHashType::FORWARD){ - this->ds.insert(this->get_pos(), this->fhash); - }else{ - this->ds.insert(this->get_pos(), this->rhash); - } - - this->roll_one(); - this->ds_size++; - } - - while (this->is_valid_hash and vec.size() < amount){ - Syncmer::roll_ds_sync(vec); - } - } - - template - void Syncmer::roll_minimizer(unsigned amount, std::vector>& vec){ - amount += vec.size(); - - while (this->ds_size + 1 < this->large_window and this->is_valid_hash) { - if(this->get_minimized_h() == digest::MinimizedHashType::CANON){ - this->ds.insert(this->get_pos(), this->chash); - }else if(this->get_minimized_h() == digest::MinimizedHashType::FORWARD){ - this->ds.insert(this->get_pos(), this->fhash); - }else{ - this->ds.insert(this->get_pos(), this->rhash); - } - - this->roll_one(); - this->ds_size++; - } - - while (this->is_valid_hash and vec.size() < amount){ - Syncmer::roll_ds_sync(vec); - } - } - - template - void Syncmer::roll_ds_sync(std::vector& vec){ - if(this->get_minimized_h() == digest::MinimizedHashType::CANON){ - this->ds.insert(this->get_pos(), this->chash); - }else if(this->get_minimized_h() == digest::MinimizedHashType::FORWARD){ - this->ds.insert(this->get_pos(), this->fhash); - }else{ - this->ds.insert(this->get_pos(), this->rhash); - } - this->ds.min_syncmer(vec); - - this->roll_one(); - } - - template - void Syncmer::roll_ds_sync(std::vector>& vec){ - if(this->get_minimized_h() == digest::MinimizedHashType::CANON){ - this->ds.insert(this->get_pos(), this->chash); - }else if(this->get_minimized_h() == digest::MinimizedHashType::FORWARD){ - this->ds.insert(this->get_pos(), this->fhash); - }else{ - 
this->ds.insert(this->get_pos(), this->rhash); - } - this->ds.min_syncmer(vec); - - this->roll_one(); - } -} \ No newline at end of file diff --git a/include/digest/thread_out.hpp b/include/digest/thread_out.hpp index 63e51e2..7846cf3 100644 --- a/include/digest/thread_out.hpp +++ b/include/digest/thread_out.hpp @@ -5,34 +5,121 @@ #include "digest/syncmer.hpp" #include "digest/window_minimizer.hpp" #include +#include #include #include -/* - Possible implementation for multi-threading the digestion of a single - sequence. The key thing to note is basically by carefully telling where each - digester should start digesting you can make it so each kmer is only - considered once. I have very little experience with threading, so you could - probably thread this out better than me - - IMPORTANT: This approach will not generate correct results for sequences - that contain non-ACTG characters. Take this example, seq = ACTGANACNACTGA, k - = 4, l_wind = 4, thread_count = 2, there is a total of 4 valid kmers in this - sequence, and thus only 1 valid large window, but we can't know this until it - actually goes through the sequence, so it's going to try to partition the - sequence into ACTGANACNA, and ANACNACTGA and feed it into 2 digester objects - which now each have 0 valid large windows -*/ -namespace thread_out { +/** + * + * @brief Possible implementation for multi-threading the digestion of a single + * sequence. The key thing to note is basically by carefully telling where each + * digester should start digesting you can make it so each kmer is only + * considered once. + * For more details on a function, click on more and it will take you to the + * description that is located in modules + * + * @par IMPORTANT: + * This approach will not generate correct results for sequences + * that contain non-ACTG characters. 
Take this example, seq = ACTGANACNACTGA, k + * = 4, l_wind = 4, thread_count = 2, there is a total of 4 valid kmers in this + * sequence, and thus only 1 valid large window, but we can't know this until it + * actually goes through the sequence, so it's going to try to partition the + * sequence into ACTGANACNA, and ANACNACTGA and feed it into 2 digester objects + * which now each have 0 valid large windows + */ +namespace digest::thread_out { +/** + * @brief Exception thrown when invalid parameters are passed to the thread + * functions + */ class BadThreadOutParams : public std::exception { - const char *what() const throw() { - return "k must be greater than 3, start must be less than len, \ + const char *what() const throw() { + return "k must be greater than 3, start must be less than len, \ and num threads must be greater or equal to the number of kmers/large windows \ large_wind_kmer_am can't be 0"; - } + } }; +//------------- WORKER FUNCTIONS ---------------- + +// function that's passed to the thread for ModMinmizers +template +std::vector thread_mod_roll1(const char *seq, size_t ind, unsigned k, + uint32_t mod, uint32_t congruence, + digest::MinimizedHashType minimized_h, + unsigned assigned_kmer_am) { + std::vector out; + digest::ModMin

dig(seq, ind + assigned_kmer_am + k - 1, k, mod, + congruence, ind, minimized_h); + dig.roll_minimizer(assigned_kmer_am, out); + return out; +} + +template +std::vector> +thread_mod_roll2(const char *seq, size_t ind, unsigned k, uint32_t mod, + uint32_t congruence, digest::MinimizedHashType minimized_h, + unsigned assigned_kmer_am) { + std::vector> out; + digest::ModMin

dig(seq, ind + assigned_kmer_am + k - 1, k, mod, + congruence, ind, minimized_h); + dig.roll_minimizer(assigned_kmer_am, out); + return out; +} + +// function that's passed to the thread for WindowMinimizers +template +std::vector thread_wind_roll1(const char *seq, size_t ind, unsigned k, + uint32_t large_wind_kmer_am, + digest::MinimizedHashType minimized_h, + unsigned assigned_lwind_am) { + std::vector out; + digest::WindowMin dig( + seq, ind + assigned_lwind_am + k + large_wind_kmer_am - 1 - 1, k, + large_wind_kmer_am, ind, minimized_h); + dig.roll_minimizer(assigned_lwind_am, out); + return out; +} + +template +std::vector> thread_wind_roll2( + const char *seq, size_t ind, unsigned k, uint32_t large_wind_kmer_am, + digest::MinimizedHashType minimized_h, unsigned assigned_lwind_am) { + std::vector> out; + digest::WindowMin dig( + seq, ind + assigned_lwind_am + k + large_wind_kmer_am - 1 - 1, k, + large_wind_kmer_am, ind, minimized_h); + dig.roll_minimizer(assigned_lwind_am, out); + return out; +} + +// function that's passed to the thread for Syncmers +template +std::vector thread_sync_roll1(const char *seq, size_t ind, unsigned k, + uint32_t large_wind_kmer_am, + digest::MinimizedHashType minimized_h, + unsigned assigned_lwind_am) { + std::vector out; + digest::Syncmer dig( + seq, ind + assigned_lwind_am + k + large_wind_kmer_am - 1 - 1, k, + large_wind_kmer_am, ind, minimized_h); + dig.roll_minimizer(assigned_lwind_am, out); + return out; +} + +template +std::vector> thread_sync_roll2( + const char *seq, size_t ind, unsigned k, uint32_t large_wind_kmer_am, + digest::MinimizedHashType minimized_h, unsigned assigned_lwind_am) { + std::vector> out; + digest::Syncmer dig( + seq, ind + assigned_lwind_am + k + large_wind_kmer_am - 1 - 1, k, + large_wind_kmer_am, ind, minimized_h); + dig.roll_minimizer(assigned_lwind_am, out); + return out; +} + /** * @param thread_count the number of threads to use * @param vec a vector of vectors in which the minimizers will be 
placed. @@ -55,42 +142,126 @@ class BadThreadOutParams : public std::exception { */ template void thread_mod( - unsigned thread_count, std::vector> &vec, - const char *seq, size_t len, unsigned k, uint32_t mod, - uint32_t congruence = 0, size_t start = 0, - digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON); + unsigned thread_count, std::vector> &vec, + const char *seq, size_t len, unsigned k, uint32_t mod, + uint32_t congruence = 0, size_t start = 0, + digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON) { + int num_kmers = (int)len - (int)start - (int)k + 1; + if (k < 4 || start >= len || num_kmers < 0 || + (unsigned)num_kmers < thread_count) { + throw BadThreadOutParams(); + } + unsigned kmers_per_thread = num_kmers / thread_count; + unsigned extras = num_kmers % thread_count; + vec.reserve(thread_count); + std::vector>> thread_vector; + + size_t ind = start; + for (unsigned i = 0; i < thread_count; i++) { + // issue is here + // this will lead to a leak + unsigned assigned_kmer_am = kmers_per_thread; + if (extras > 0) { + ++(assigned_kmer_am); + extras--; + } + + thread_vector.emplace_back(std::async(thread_mod_roll1

, seq, ind, k, + mod, congruence, minimized_h, + assigned_kmer_am)); + + ind += assigned_kmer_am; + } + for (auto &t : thread_vector) { + vec.emplace_back(t.get()); + } +} /** + * @brief same as the other thread_mod, except it can take a C++ string, and + * does not need to be provided the length of the string + * * @param seq C++ string of DNA sequence to be hashed. */ template void thread_mod( - unsigned thread_count, std::vector> &vec, - const std::string &seq, unsigned k, uint32_t mod, uint32_t congruence = 0, - size_t start = 0, - digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON); + unsigned thread_count, std::vector> &vec, + const std::string &seq, unsigned k, uint32_t mod, uint32_t congruence = 0, + size_t start = 0, + digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON) { + thread_mod

(thread_count, vec, seq.c_str(), seq.size(), k, mod, + congruence, start, minimized_h); +} /** + * @brief same as other thread_mod that takes a c-string, + * except here vec is a vector of vectors of pairs of uint32_ts + * * @param vec vec will contain both the index and the hash of minimizers. * All other things previously stated about vec remain true */ template void thread_mod( - unsigned thread_count, - std::vector>> &vec, - const char *seq, size_t len, unsigned k, uint32_t mod, - uint32_t congruence = 0, size_t start = 0, - digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON); + unsigned thread_count, + std::vector>> &vec, + const char *seq, size_t len, unsigned k, uint32_t mod, + uint32_t congruence = 0, size_t start = 0, + digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON) { + int num_kmers = (int)len - (int)start - (int)k + 1; + if (k < 4 || start >= len || num_kmers < 0 || + (unsigned)num_kmers < thread_count) { + throw BadThreadOutParams(); + } + unsigned kmers_per_thread = num_kmers / thread_count; + unsigned extras = num_kmers % thread_count; + vec.reserve(thread_count); + std::vector>>> + thread_vector; + + size_t ind = start; + for (unsigned i = 0; i < thread_count; i++) { + // issue is here + // this will lead to a leak + unsigned assigned_kmer_am = kmers_per_thread; + if (extras > 0) { + ++(assigned_kmer_am); + extras--; + } + thread_vector.emplace_back(std::async(thread_mod_roll2

, seq, ind, k, + mod, congruence, minimized_h, + assigned_kmer_am)); + + ind += assigned_kmer_am; + } + for (auto &t : thread_vector) { + vec.emplace_back(t.get()); + } +} + +/** + * @brief same as other thread_mod that takes a C++ string, + * except here vec is a vector of vectors of pairs of uint32_ts + * + * @param vec vec will contain both the index and the hash of minimizers. + * All other things previously stated about vec remain true + */ template void thread_mod( - unsigned thread_count, - std::vector>> &vec, - const std::string &seq, unsigned k, uint32_t mod, uint32_t congruence = 0, - size_t start = 0, - digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON); + unsigned thread_count, + std::vector>> &vec, + const std::string &seq, unsigned k, uint32_t mod, uint32_t congruence = 0, + size_t start = 0, + digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON) { + thread_mod

(thread_count, vec, seq.c_str(), seq.size(), k, mod, + congruence, start, minimized_h); +} /** + * @tparam P policy for dealing with non-ACTG characters + * @tparam T min query data structure to use, refer to docs of the classes in + * the ds namespace for more info + * * @param thread_count the number of threads to use * @param vec a vector of vectors in which the minimizers will be placed. * Each vector corresponds to one thread. The minimizers within each vector @@ -108,45 +279,149 @@ void thread_mod( * * @throws BadThreadOutParams */ -// number of k-mers to be considered in the large window template void thread_wind( - unsigned thread_count, std::vector> &vec, - const char *seq, size_t len, unsigned k, uint32_t large_wind_kmer_am, - size_t start = 0, - digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON); + unsigned thread_count, std::vector> &vec, + const char *seq, size_t len, unsigned k, uint32_t large_wind_kmer_am, + size_t start = 0, + digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON) { + int num_lwinds = (int)len - (int)start - (int)(k + large_wind_kmer_am) + 2; + if (large_wind_kmer_am == 0 || k < 4 || start >= len || num_lwinds < 0 || + (unsigned)num_lwinds < thread_count) { + throw BadThreadOutParams(); + } + unsigned lwinds_per_thread = num_lwinds / thread_count; + unsigned extras = num_lwinds % thread_count; + vec.reserve(thread_count); + std::vector>> thread_vector; + + size_t ind = start; + for (unsigned i = 0; i < thread_count; i++) { + // issue is here + // this will lead to a leak + unsigned assigned_lwind_am = lwinds_per_thread; + if (extras > 0) { + ++(assigned_lwind_am); + extras--; + } + + thread_vector.emplace_back(std::async(thread_wind_roll1, seq, ind, + k, large_wind_kmer_am, + minimized_h, assigned_lwind_am)); + + ind += assigned_lwind_am; + } + for (auto &t : thread_vector) { + vec.emplace_back(t.get()); + } + + // handle duplicates + // the only possible place for a duplicate is for the 
last element + // of vec[i] to equal the first value of vec[i+1] due to the fact + // that thread_i+1 can't know the last minimizer of thread_i + for (unsigned i = 0; i < thread_count - 1; i++) { + int last = (int)vec[i].size() - 1; + if (vec[i][last] == vec[i + 1][0]) { + vec[i].pop_back(); + } + } +} /** + * @brief same as the other thread_wind, except it can take a C++ string, and + * does not need to be provided the length of the string + * * @param seq C++ string of DNA sequence to be hashed. */ template void thread_wind( - unsigned thread_count, std::vector> &vec, - const std::string &seq, unsigned k, uint32_t large_wind_kmer_am, - size_t start = 0, - digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON); + unsigned thread_count, std::vector> &vec, + const std::string &seq, unsigned k, uint32_t large_wind_kmer_am, + size_t start = 0, + digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON) { + thread_wind(thread_count, vec, seq.c_str(), seq.size(), k, + large_wind_kmer_am, start, minimized_h); +} /** + * @brief same as other thread_wind that takes a c-string, + * except here vec is a vector of vectors of pairs of uint32_ts + * * @param vec vec will contain both the index and the hash of minimizers. 
* All other things previously stated about vec remain true */ template void thread_wind( - unsigned thread_count, - std::vector>> &vec, - const char *seq, size_t len, unsigned k, uint32_t large_wind_kmer_am, - size_t start = 0, - digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON); + unsigned thread_count, + std::vector>> &vec, + const char *seq, size_t len, unsigned k, uint32_t large_wind_kmer_am, + size_t start = 0, + digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON) { + int num_lwinds = (int)len - (int)start - (int)(k + large_wind_kmer_am) + 2; + if (large_wind_kmer_am == 0 || k < 4 || start >= len || num_lwinds < 0 || + (unsigned)num_lwinds < thread_count) { + throw BadThreadOutParams(); + } + unsigned lwinds_per_thread = num_lwinds / thread_count; + unsigned extras = num_lwinds % thread_count; + vec.reserve(thread_count); + std::vector>> thread_vector; + + size_t ind = start; + for (unsigned i = 0; i < thread_count; i++) { + // issue is here + // this will lead to a leak + unsigned assigned_lwind_am = lwinds_per_thread; + if (extras > 0) { + ++(assigned_lwind_am); + extras--; + } + + thread_vector.emplace_back(std::async(thread_wind_roll2, seq, ind, + k, large_wind_kmer_am, + minimized_h, assigned_lwind_am)); + + ind += assigned_lwind_am; + } + for (auto &t : thread_vector) { + vec.emplace_back(t.get()); + } + // handle duplicates + // the only possible place for a duplicate is for the last element + // of vec[i] to equal the first value of vec[i+1] due to the fact + // that thread_i+1 can't know the last minimizer of thread_i + for (unsigned i = 0; i < thread_count - 1; i++) { + int last = (int)vec[i].size() - 1; + if (vec[i][last] == vec[i + 1][0]) { + vec[i].pop_back(); + } + } +} + +/** + * @brief same as other thread_wind that takes a C++ string, + * except here vec is a vector of vectors of pairs of uint32_ts + * + * @param vec vec will contain both the index and the hash of minimizers. 
+ * All other things previously stated about vec remain true + */ template void thread_wind( - unsigned thread_count, - std::vector>> &vec, - const std::string &seq, unsigned k, uint32_t large_wind_kmer_am, - size_t start = 0, - digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON); + unsigned thread_count, + std::vector>> &vec, + const std::string &seq, unsigned k, uint32_t large_wind_kmer_am, + size_t start = 0, + digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON) { + thread_wind(thread_count, vec, seq.c_str(), seq.size(), k, + large_wind_kmer_am, start, minimized_h); +} /** + * @tparam P policy for dealing with non-ACTG characters + * @tparam T min query data structure to use, refer to docs of the classes in + * the ds namespace for more info + * * @param thread_count the number of threads to use * @param vec a vector of vectors in which the minimizers will be placed. * Each vector corresponds to one thread. The minimizers within each vector @@ -164,84 +439,124 @@ void thread_wind( * * @throws BadThreadOutParams */ -// number of k-mers to be considered in the large window template void thread_sync( - unsigned thread_count, std::vector> &vec, - const char *seq, size_t len, unsigned k, uint32_t large_wind_kmer_am, - size_t start = 0, - digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON); + unsigned thread_count, std::vector> &vec, + const char *seq, size_t len, unsigned k, uint32_t large_wind_kmer_am, + size_t start = 0, + digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON) { + int num_lwinds = (int)len - (int)start - (int)(k + large_wind_kmer_am) + 2; + if (large_wind_kmer_am == 0 || k < 4 || start >= len || num_lwinds < 0 || + (unsigned)num_lwinds < thread_count) { + throw BadThreadOutParams(); + } + unsigned lwinds_per_thread = num_lwinds / thread_count; + unsigned extras = num_lwinds % thread_count; + vec.reserve(thread_count); + std::vector>> thread_vector; + + size_t ind = 
start; + for (unsigned i = 0; i < thread_count; i++) { + // issue is here + // this will lead to a leak + unsigned assigned_lwind_am = lwinds_per_thread; + if (extras > 0) { + ++(assigned_lwind_am); + extras--; + } + + thread_vector.emplace_back(std::async(thread_sync_roll1, seq, ind, + k, large_wind_kmer_am, + minimized_h, assigned_lwind_am)); + + ind += assigned_lwind_am; + } + for (auto &t : thread_vector) { + + vec.emplace_back(t.get()); + } +} /** + * @brief same as the other thread_sync, except it can take a C++ string, and + * does not need to be provided the length of the string + * * @param seq C++ string of DNA sequence to be hashed. */ template void thread_sync( - unsigned thread_count, std::vector> &vec, - const std::string &seq, unsigned k, uint32_t large_wind_kmer_am, - size_t start = 0, - digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON); + unsigned thread_count, std::vector> &vec, + const std::string &seq, unsigned k, uint32_t large_wind_kmer_am, + size_t start = 0, + digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON) { + thread_sync(thread_count, vec, seq.c_str(), seq.size(), k, + large_wind_kmer_am, start, minimized_h); +} /** + * @brief same as other thread_wind that takes a c-string, + * except here vec is a vector of vectors of pairs of uint32_ts + * * @param vec vec will contain both the index and the hash of minimizers. 
* All other things previously stated about vec remain true */ template void thread_sync( - unsigned thread_count, - std::vector>> &vec, - const char *seq, size_t len, unsigned k, uint32_t large_wind_kmer_am, - size_t start = 0, - digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON); + unsigned thread_count, + std::vector>> &vec, + const char *seq, size_t len, unsigned k, uint32_t large_wind_kmer_am, + size_t start = 0, + digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON) { + int num_lwinds = (int)len - (int)start - (int)(k + large_wind_kmer_am) + 2; + if (large_wind_kmer_am == 0 || k < 4 || start >= len || num_lwinds < 0 || + (unsigned)num_lwinds < thread_count) { + throw BadThreadOutParams(); + } + unsigned lwinds_per_thread = num_lwinds / thread_count; + unsigned extras = num_lwinds % thread_count; + vec.reserve(thread_count); + std::vector>>> + thread_vector; -template -void thread_sync( - unsigned thread_count, - std::vector>> &vec, - const std::string &seq, unsigned k, uint32_t large_wind_kmer_am, - size_t start = 0, - digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON); + size_t ind = start; + for (unsigned i = 0; i < thread_count; i++) { + // issue is here + // this will lead to a leak + unsigned assigned_lwind_am = lwinds_per_thread; + if (extras > 0) { + ++(assigned_lwind_am); + extras--; + } -//------------- WORKER FUNCTIONS ---------------- + thread_vector.emplace_back(std::async(thread_sync_roll2, seq, ind, + k, large_wind_kmer_am, + minimized_h, assigned_lwind_am)); -// function that's passed to the thread for ModMinmizers -template -std::vector thread_mod_roll1(const char *seq, size_t ind, unsigned k, - uint32_t mod, uint32_t congruence, - digest::MinimizedHashType minimized_h, - unsigned assigned_kmer_am); - -template -std::vector> -thread_mod_roll2(const char *seq, size_t ind, unsigned k, uint32_t mod, - uint32_t congruence, digest::MinimizedHashType minimized_h, - unsigned 
assigned_kmer_am); - -// function that's passed to the thread for WindowMinimizers -template -std::vector thread_wind_roll1(const char *seq, size_t ind, unsigned k, - uint32_t large_wind_kmer_am, - digest::MinimizedHashType minimized_h, - unsigned assigned_lwind_am); - -template -std::vector> thread_wind_roll2( - const char *seq, size_t ind, unsigned k, uint32_t large_wind_kmer_am, - digest::MinimizedHashType minimized_h, unsigned assigned_lwind_am); - -// function that's passed to the thread for Syncmers -template -std::vector thread_sync_roll1(const char *seq, size_t ind, unsigned k, - uint32_t large_wind_kmer_am, - digest::MinimizedHashType minimized_h, - unsigned assigned_lwind_am); + ind += assigned_lwind_am; + } + for (auto &t : thread_vector) { + vec.emplace_back(t.get()); + } +} +/** + * @brief same as other thread_sync that takes a C++ string, + * except here vec is a vector of vectors of pairs of uint32_ts + * + * @param vec vec will contain both the index and the hash of minimizers. 
+ * All other things previously stated about vec remain true + */ template -std::vector> thread_sync_roll2( - const char *seq, size_t ind, unsigned k, uint32_t large_wind_kmer_am, - digest::MinimizedHashType minimized_h, unsigned assigned_lwind_am); +void thread_sync( + unsigned thread_count, + std::vector>> &vec, + const std::string &seq, unsigned k, uint32_t large_wind_kmer_am, + size_t start = 0, + digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON) { + thread_sync(thread_count, vec, seq.c_str(), seq.size(), k, + large_wind_kmer_am, start, minimized_h); +} -} // namespace thread_out +} // namespace digest::thread_out -#include "thread_out.tpp" -#endif +#endif // THREAD_OUT_HPP diff --git a/include/digest/thread_out.tpp b/include/digest/thread_out.tpp deleted file mode 100644 index 98a2188..0000000 --- a/include/digest/thread_out.tpp +++ /dev/null @@ -1,337 +0,0 @@ -#include "digest/thread_out.hpp" -#include - -namespace thread_out -{ -template -void thread_mod(unsigned thread_count, std::vector>& vec, - const char* seq, size_t len, unsigned k, uint32_t mod, uint32_t congruence, size_t start, - digest::MinimizedHashType minimized_h){ - int num_kmers = (int)len - (int)start - (int)k + 1; - if(k < 4 ||start >= len || num_kmers < 0 || (unsigned)num_kmers < thread_count){ - throw BadThreadOutParams(); - } - unsigned kmers_per_thread = num_kmers/thread_count; - unsigned extras = num_kmers % thread_count; - vec.reserve(thread_count); - std::vector>> thread_vector; - - size_t ind = start; - for(unsigned i = 0; i < thread_count; i++){ - // issue is here - // this will lead to a leak - unsigned assigned_kmer_am = kmers_per_thread; - if(extras > 0){ - ++(assigned_kmer_am); - extras--; - } - - thread_vector.emplace_back(std::async(thread_mod_roll1

, - seq, ind, k, mod, congruence, minimized_h, assigned_kmer_am)); - - ind += assigned_kmer_am; - } - for(auto& t: thread_vector) - { - vec.emplace_back(t.get()); - } - } - -template -void thread_mod(unsigned thread_count, std::vector>>& vec, - const char* seq, size_t len, unsigned k, uint32_t mod, uint32_t congruence, size_t start, - digest::MinimizedHashType minimized_h){ - int num_kmers = (int)len - (int)start - (int)k + 1; - if(k < 4 ||start >= len || num_kmers < 0 || (unsigned)num_kmers < thread_count){ - throw BadThreadOutParams(); - } - unsigned kmers_per_thread = num_kmers/thread_count; - unsigned extras = num_kmers % thread_count; - vec.reserve(thread_count); - std::vector>>> thread_vector; - - size_t ind = start; - for(unsigned i = 0; i < thread_count; i++){ - // issue is here - // this will lead to a leak - unsigned assigned_kmer_am = kmers_per_thread; - if(extras > 0){ - ++(assigned_kmer_am); - extras--; - } - - thread_vector.emplace_back(std::async(thread_mod_roll2

, - seq, ind, k, mod, congruence, minimized_h, assigned_kmer_am)); - - ind += assigned_kmer_am; - } - for(auto& t: thread_vector) - { - vec.emplace_back(t.get()); - } - } - -template -void thread_mod(unsigned thread_count, std::vector>& vec, - const std::string& seq, unsigned k, uint32_t mod, uint32_t congruence, size_t start, - digest::MinimizedHashType minimized_h){ - thread_mod

(thread_count, vec, seq.c_str(), seq.size(), k, mod, congruence, start, minimized_h); - } - -template -void thread_mod(unsigned thread_count, std::vector>>& vec, - const std::string& seq, unsigned k, uint32_t mod, uint32_t congruence, size_t start, - digest::MinimizedHashType minimized_h){ - thread_mod

(thread_count, vec, seq.c_str(), seq.size(), k, mod, congruence, start, minimized_h); - } - -template -std::vector thread_mod_roll1(const char* seq, - size_t ind, unsigned k, uint32_t mod, uint32_t congruence, - digest::MinimizedHashType minimized_h, unsigned assigned_kmer_am){ - std::vector out; - digest::ModMin

dig(seq, ind + assigned_kmer_am + k -1, k, mod, congruence, ind, minimized_h); - dig.roll_minimizer(assigned_kmer_am, out); - return out; - } - -template -std::vector> thread_mod_roll2(const char* seq, - size_t ind, unsigned k, uint32_t mod, uint32_t congruence, - digest::MinimizedHashType minimized_h, unsigned assigned_kmer_am){ - std::vector> out; - digest::ModMin

dig(seq, ind + assigned_kmer_am + k -1, k, mod, congruence, ind, minimized_h); - dig.roll_minimizer(assigned_kmer_am, out); - return out; - } - -template -void thread_wind(unsigned thread_count, std::vector>& vec, - const char* seq, size_t len, unsigned k, uint32_t large_wind_kmer_am, size_t start, - digest::MinimizedHashType minimized_h){ - int num_lwinds = (int)len - (int)start - (int)(k+large_wind_kmer_am)+2; - if(large_wind_kmer_am == 0 || k < 4 ||start >= len || num_lwinds < 0 || (unsigned)num_lwinds < thread_count){ - throw BadThreadOutParams(); - } - unsigned lwinds_per_thread = num_lwinds/thread_count; - unsigned extras = num_lwinds % thread_count; - vec.reserve(thread_count); - std::vector>> thread_vector; - - size_t ind = start; - for(unsigned i = 0; i < thread_count; i++){ - // issue is here - // this will lead to a leak - unsigned assigned_lwind_am = lwinds_per_thread; - if(extras > 0){ - ++(assigned_lwind_am); - extras--; - } - - thread_vector.emplace_back(std::async(thread_wind_roll1, - seq, ind, k, large_wind_kmer_am, minimized_h, assigned_lwind_am)); - - ind += assigned_lwind_am; - } - for(auto& t: thread_vector) - { - vec.emplace_back(t.get()); - } - - // handle duplicates - // the only possible place for a duplicate is for the last element - // of vec[i] to equal the first value of vec[i+1] due to the fact - // that thread_i+1 can't know the last minimizer of thread_i - for(unsigned i = 0; i < thread_count-1; i++){ - int last = (int)vec[i].size() - 1; - if(vec[i][last] == vec[i+1][0]){ - vec[i].pop_back(); - } - } - } - -template -void thread_wind(unsigned thread_count, std::vector>>& vec, - const char* seq, size_t len, unsigned k, uint32_t large_wind_kmer_am, size_t start, - digest::MinimizedHashType minimized_h){ - int num_lwinds = (int)len - (int)start - (int)(k+large_wind_kmer_am)+2; - if(large_wind_kmer_am == 0 || k < 4 ||start >= len || num_lwinds < 0 || (unsigned)num_lwinds < thread_count){ - throw BadThreadOutParams(); - } - unsigned 
lwinds_per_thread = num_lwinds/thread_count; - unsigned extras = num_lwinds % thread_count; - vec.reserve(thread_count); - std::vector>> thread_vector; - - size_t ind = start; - for(unsigned i = 0; i < thread_count; i++){ - // issue is here - // this will lead to a leak - unsigned assigned_lwind_am = lwinds_per_thread; - if(extras > 0){ - ++(assigned_lwind_am); - extras--; - } - - thread_vector.emplace_back(std::async(thread_wind_roll2, - seq, ind, k, large_wind_kmer_am, minimized_h, assigned_lwind_am)); - - ind += assigned_lwind_am; - } - for(auto& t: thread_vector) - { - vec.emplace_back(t.get()); - } - - // handle duplicates - // the only possible place for a duplicate is for the last element - // of vec[i] to equal the first value of vec[i+1] due to the fact - // that thread_i+1 can't know the last minimizer of thread_i - for(unsigned i = 0; i < thread_count-1; i++){ - int last = (int)vec[i].size() - 1; - if(vec[i][last] == vec[i+1][0]){ - vec[i].pop_back(); - } - } - } - -template -void thread_wind(unsigned thread_count, std::vector>& vec, - const std::string& seq, unsigned k, uint32_t large_wind_kmer_am, size_t start, - digest::MinimizedHashType minimized_h){ - thread_wind(thread_count, vec, seq.c_str(), seq.size(), k, large_wind_kmer_am, start, minimized_h); - } - -template -void thread_wind(unsigned thread_count, std::vector>>& vec, - const std::string& seq, unsigned k, uint32_t large_wind_kmer_am, size_t start, - digest::MinimizedHashType minimized_h){ - thread_wind(thread_count, vec, seq.c_str(), seq.size(), k, large_wind_kmer_am, start, minimized_h); - } - -template -std::vector thread_wind_roll1(const char* seq, - size_t ind, unsigned k, uint32_t large_wind_kmer_am, - digest::MinimizedHashType minimized_h, unsigned assigned_lwind_am){ - std::vector out; - digest::WindowMin dig(seq, ind + assigned_lwind_am + k + large_wind_kmer_am -1 -1, k, large_wind_kmer_am, ind, minimized_h); - dig.roll_minimizer(assigned_lwind_am, out); - return out; - } - -template 
-std::vector> thread_wind_roll2(const char* seq, - size_t ind, unsigned k, uint32_t large_wind_kmer_am, - digest::MinimizedHashType minimized_h, unsigned assigned_lwind_am){ - std::vector> out; - digest::WindowMin dig(seq, ind + assigned_lwind_am + k + large_wind_kmer_am -1 -1, k, large_wind_kmer_am, ind, minimized_h); - dig.roll_minimizer(assigned_lwind_am, out); - return out; - } - -template -void thread_sync(unsigned thread_count, std::vector>& vec, - const char* seq, size_t len, unsigned k, uint32_t large_wind_kmer_am, size_t start, - digest::MinimizedHashType minimized_h){ - int num_lwinds = (int)len - (int)start - (int)(k+large_wind_kmer_am)+2; - if(large_wind_kmer_am == 0 || k < 4 ||start >= len || num_lwinds < 0 || (unsigned)num_lwinds < thread_count){ - throw BadThreadOutParams(); - } - unsigned lwinds_per_thread = num_lwinds/thread_count; - unsigned extras = num_lwinds % thread_count; - vec.reserve(thread_count); - std::vector>> thread_vector; - - size_t ind = start; - for(unsigned i = 0; i < thread_count; i++){ - // issue is here - // this will lead to a leak - unsigned assigned_lwind_am = lwinds_per_thread; - if(extras > 0){ - ++(assigned_lwind_am); - extras--; - } - - thread_vector.emplace_back(std::async(thread_sync_roll1, - seq, ind, k, large_wind_kmer_am, minimized_h, assigned_lwind_am)); - - ind += assigned_lwind_am; - } - for(auto& t: thread_vector) - { - vec.emplace_back(t.get()); - } - - } -template -void thread_sync(unsigned thread_count, std::vector>>& vec, - const char* seq, size_t len, unsigned k, uint32_t large_wind_kmer_am, size_t start, - digest::MinimizedHashType minimized_h){ - int num_lwinds = (int)len - (int)start - (int)(k+large_wind_kmer_am)+2; - if(large_wind_kmer_am == 0 || k < 4 ||start >= len || num_lwinds < 0 || (unsigned)num_lwinds < thread_count){ - throw BadThreadOutParams(); - } - unsigned lwinds_per_thread = num_lwinds/thread_count; - unsigned extras = num_lwinds % thread_count; - vec.reserve(thread_count); - 
std::vector>>> thread_vector; - - size_t ind = start; - for(unsigned i = 0; i < thread_count; i++){ - // issue is here - // this will lead to a leak - unsigned assigned_lwind_am = lwinds_per_thread; - if(extras > 0){ - ++(assigned_lwind_am); - extras--; - } - - thread_vector.emplace_back(std::async(thread_sync_roll2, - seq, ind, k, large_wind_kmer_am, minimized_h, assigned_lwind_am)); - - ind += assigned_lwind_am; - } - for(auto& t: thread_vector) - { - vec.emplace_back(t.get()); - } - - } - -template -void thread_sync(unsigned thread_count, std::vector>& vec, - const std::string& seq, unsigned k, uint32_t large_wind_kmer_am, size_t start, - digest::MinimizedHashType minimized_h){ - thread_sync(thread_count, vec, seq.c_str(), seq.size(), k, large_wind_kmer_am, start, minimized_h); - } - -template -void thread_sync(unsigned thread_count, std::vector>>& vec, - const std::string& seq, unsigned k, uint32_t large_wind_kmer_am, size_t start, - digest::MinimizedHashType minimized_h){ - thread_sync(thread_count, vec, seq.c_str(), seq.size(), k, large_wind_kmer_am, start, minimized_h); - } - -template -std::vector thread_sync_roll1(const char* seq, - size_t ind, unsigned k, uint32_t large_wind_kmer_am, - digest::MinimizedHashType minimized_h, unsigned assigned_lwind_am){ - std::vector out; - digest::Syncmer dig(seq, ind + assigned_lwind_am + k + large_wind_kmer_am -1 -1, k, large_wind_kmer_am, ind, minimized_h); - dig.roll_minimizer(assigned_lwind_am, out); - return out; - } - -template -std::vector> thread_sync_roll2(const char* seq, - size_t ind, unsigned k, uint32_t large_wind_kmer_am, - digest::MinimizedHashType minimized_h, unsigned assigned_lwind_am){ - std::vector> out; - digest::Syncmer dig(seq, ind + assigned_lwind_am + k + large_wind_kmer_am -1 -1, k, large_wind_kmer_am, ind, minimized_h); - dig.roll_minimizer(assigned_lwind_am, out); - return out; - } -} - - - diff --git a/include/digest/window_minimizer.hpp b/include/digest/window_minimizer.hpp index 
c9026de..fcb7787 100644 --- a/include/digest/window_minimizer.hpp +++ b/include/digest/window_minimizer.hpp @@ -1,138 +1,245 @@ -#pragma once -#include "data_structure.hpp" -#include "digest/digester.hpp" -#include -#include - -namespace digest { - -class BadWindowSizeException : public std::exception { - const char *what() const throw() { - return "Number of kmers in large window cannot be 0"; - } -}; - -// number of k-mers to be considered in the large window -template class WindowMin : public Digester

{ -public: - /** - * @param seq - * @param len - * @param k - * @param large_window - * @param start - * @param minimized_h - * - * @throws BadWindowException Thrown when congruence is greater or equal to - * mod - */ - WindowMin(const char *seq, size_t len, unsigned k, unsigned large_window, - size_t start = 0, - MinimizedHashType minimized_h = MinimizedHashType::CANON) - : Digester

(seq, len, k, start, minimized_h), ds(large_window), - large_window(large_window), ds_size(0), is_minimized(false) { - if (large_window == 0) { - throw BadWindowSizeException(); - } - } - - /** - * @param seq - * @param k - * @param large_window - * @param start - * @param minimized_h - * - * @throws BadWindowException Thrown when congruence is greater or equal to - * mod - */ - WindowMin(const std::string &seq, unsigned k, unsigned large_window, - size_t start = 0, - MinimizedHashType minimized_h = MinimizedHashType::CANON) - : WindowMin(seq.c_str(), seq.size(), k, large_window, start, - minimized_h) {} - - /** - * @brief adds up to amount of positions of minimizers into vec, here a k-mer - * is considered a minimizer if its hash is the smallest in the large window, - * using rightmost index wins in ties - * - * @param amount - * @param vec - */ - virtual void roll_minimizer(unsigned amount, - std::vector &vec) override; - - /** - * @brief adds up to amount of positions and hashes of minimizers into vec, - * here a k-mer is considered a minimizer if its hash is the smallest in the - * large window, using rightmost index wins in ties - * - * @param amount - * @param vec - */ - virtual void - roll_minimizer(unsigned amount, - std::vector> &vec) override; - - unsigned get_large_wind_kmer_am() { return large_window; } - - size_t get_ds_size() { return ds_size; } - - // function is mainly to help with tests - bool get_is_minimized() { return is_minimized; } - -protected: - // data structure which will find miminum - T ds; - - uint32_t large_window; - - // internal counter that tracks the number of actual values in the data - // structure - size_t ds_size; - - // internal bool keeping track of if we have obtained the first minimizer yet, - // because we don't want to add a position to the vector if it's already in - // there - bool is_minimized; - - // the index of previous minimizer, a minimizer is only a new minimizer if it - // is different from the previous 
minimizer - uint32_t prev_mini; - -private: - /** - * @brief helper function which handles adding the next hash into the data - * structure - * - */ - void roll_ds_wind(std::vector &vec); - - /** - * @brief helper function which handles adding the next hash into the data - * structure - * - */ - void roll_ds_wind(std::vector> &vec); - - /** - * @brief helper function that checks to see if the current minimizer is a new - * minimizer, and should thus be added to the vec - * - * @param vec - */ - void check(std::vector &vec); - - /** - * @brief helper function that checks to see if the current minimizer is a new - * minimizer, and should thus be added to the vec - * - * @param vec - */ - void check(std::vector> &vec); -}; - -} // namespace digest - -#include "window_minimizer.tpp" +#ifndef WINDOW_MINIMIZER_HPP +#define WINDOW_MINIMIZER_HPP + +#include "data_structure.hpp" +#include "digest/digester.hpp" +#include +#include + +namespace digest { + +/** + * @brief Exception thrown when initializing a Window Minimizer or Syncmer with + * a large window size of 0. + * + * + */ +class BadWindowSizeException : public std::exception { + const char *what() const throw() { + return "Number of kmers in large window cannot be 0"; + } +}; + +/** + * @brief Child class of Digester that defines a minimizer as a kmer whose hash + * is minimal among those in the large window. Parameters without a description + * are the same as the parameters in the Digester parent class. They are simply + * passed up to the parent constructor. + * + * @tparam P + * @tparam T The data structure to use for performing range minimum queries to + * find the minimal hash value. + */ +template class WindowMin : public Digester

{ + public: + /** + * @param seq + * @param len + * @param k + * @param large_window the number of kmers in the large window, i.e. the + * number of kmers to be considered during the range minimum query. + * @param start + * @param minimized_h + * + * @throws BadWindowException thrown when large_window is passed in as 0 + */ + WindowMin(const char *seq, size_t len, unsigned k, unsigned large_window, + size_t start = 0, + MinimizedHashType minimized_h = MinimizedHashType::CANON) + : Digester

(seq, len, k, start, minimized_h), ds(large_window), + large_window(large_window), ds_size(0), is_minimized(false) { + if (large_window == 0) { + throw BadWindowSizeException(); + } + } + + /** + * @param seq + * @param k + * @param large_window the number of kmers in the large window, i.e. the + * number of kmers to be considered during the range minimum query. + * @param start + * @param minimized_h + * + * @throws BadWindowException thrown when large_window is passed in as 0 + */ + WindowMin(const std::string &seq, unsigned k, unsigned large_window, + size_t start = 0, + MinimizedHashType minimized_h = MinimizedHashType::CANON) + : WindowMin(seq.c_str(), seq.size(), k, large_window, start, + minimized_h) {} + + /** + * @brief adds up to amount of positions of minimizers into vec. Here a + * k-mer is considered a minimizer if its hash is the smallest in the large + * window. Rightmost index wins in ties + * + * @param amount + * @param vec + */ + void roll_minimizer(unsigned amount, std::vector &vec) override { + amount += vec.size(); + + while (ds_size + 1 < large_window and this->is_valid_hash) { + if (this->get_minimized_h() == digest::MinimizedHashType::CANON) { + ds.insert(this->get_pos(), this->chash); + } else if (this->get_minimized_h() == + digest::MinimizedHashType::FORWARD) { + ds.insert(this->get_pos(), this->fhash); + } else { + ds.insert(this->get_pos(), this->rhash); + } + + this->roll_one(); + ds_size++; + } + + while (this->is_valid_hash and vec.size() < amount) { + roll_ds_wind(vec); + } + } + + /** + * @brief adds up to amount of positions and hashes of minimizers into vec. + * Here a k-mer is considered a minimizer if its hash is the smallest in the + * large window. 
Rightmost index wins in ties + * + * @param amount + * @param vec + */ + void + roll_minimizer(unsigned amount, + std::vector> &vec) override { + amount += vec.size(); + + while (ds_size + 1 < large_window and this->is_valid_hash) { + if (this->get_minimized_h() == digest::MinimizedHashType::CANON) { + ds.insert(this->get_pos(), this->chash); + } else if (this->get_minimized_h() == + digest::MinimizedHashType::FORWARD) { + ds.insert(this->get_pos(), this->fhash); + } else { + ds.insert(this->get_pos(), this->rhash); + } + + this->roll_one(); + ds_size++; + } + + while (this->is_valid_hash and vec.size() < amount) { + roll_ds_wind(vec); + } + } + + /** + * + * @return unsigned, the value of large_window + */ + unsigned get_large_wind_kmer_am() { return large_window; } + + // function is mainly to help with tests + size_t get_ds_size() { return ds_size; } + + // function is mainly to help with tests + bool get_is_minimized() { return is_minimized; } + + protected: + // data structure which will find miminum + T ds; + + uint32_t large_window; + + // internal counter that tracks the number of actual values in the data + // structure + size_t ds_size; + + // internal bool keeping track of if we have obtained the first minimizer + // yet, because we don't want to add a position to the vector if it's + // already in there + bool is_minimized; + + // the index of previous minimizer, a minimizer is only a new minimizer if + // it is different from the previous minimizer + uint32_t prev_mini; + + private: + /** + * @brief helper function which handles adding the next hash into the data + * structure + * + */ + void roll_ds_wind(std::vector &vec) { + if (this->get_minimized_h() == digest::MinimizedHashType::CANON) { + ds.insert(this->get_pos(), this->chash); + } else if (this->get_minimized_h() == + digest::MinimizedHashType::FORWARD) { + ds.insert(this->get_pos(), this->fhash); + } else { + ds.insert(this->get_pos(), this->rhash); + } + check(vec); + + this->roll_one(); + } 
+ + /** + * @brief helper function which handles adding the next hash into the data + * structure + * + */ + void roll_ds_wind(std::vector> &vec) { + if (this->get_minimized_h() == digest::MinimizedHashType::CANON) { + ds.insert(this->get_pos(), this->chash); + } else if (this->get_minimized_h() == + digest::MinimizedHashType::FORWARD) { + ds.insert(this->get_pos(), this->fhash); + } else { + ds.insert(this->get_pos(), this->rhash); + } + check(vec); + + this->roll_one(); + } + + /** + * @brief helper function that checks to see if the current minimizer is a + * new minimizer, and should thus be added to the vec + * + * @param vec + */ + void check(std::vector &vec) { + if (is_minimized) { + if (ds.min() != prev_mini) { + prev_mini = ds.min(); + vec.emplace_back(prev_mini); + } + } else { + is_minimized = true; + prev_mini = ds.min(); + vec.emplace_back(prev_mini); + } + } + + /** + * @brief helper function that checks to see if the current minimizer is a + * new minimizer, and should thus be added to the vec + * + * @param vec + */ + void check(std::vector> &vec) { + if (is_minimized) { + if (ds.min() != prev_mini) { + prev_mini = ds.min(); + vec.emplace_back(prev_mini, ds.min_hash()); + } + } else { + is_minimized = true; + prev_mini = ds.min(); + vec.emplace_back(prev_mini, ds.min_hash()); + } + } +}; + +} // namespace digest + +#endif // WINDOW_MINIMIZER_HPP diff --git a/include/digest/window_minimizer.tpp b/include/digest/window_minimizer.tpp deleted file mode 100644 index 5d0e1a4..0000000 --- a/include/digest/window_minimizer.tpp +++ /dev/null @@ -1,103 +0,0 @@ -#include "digest/window_minimizer.hpp" - -namespace digest{ - template - void WindowMin::roll_minimizer(unsigned amount, std::vector& vec){ - amount += vec.size(); - - while (ds_size + 1 < large_window and this->is_valid_hash) { - if(this->get_minimized_h() == digest::MinimizedHashType::CANON){ - ds.insert(this->get_pos(), this->chash); - }else if(this->get_minimized_h() == 
digest::MinimizedHashType::FORWARD){ - ds.insert(this->get_pos(), this->fhash); - }else{ - ds.insert(this->get_pos(), this->rhash); - } - - this->roll_one(); - ds_size++; - } - - while (this->is_valid_hash and vec.size() < amount){ - roll_ds_wind(vec); - } - } - - template - void WindowMin::roll_minimizer(unsigned amount, std::vector>& vec){ - amount += vec.size(); - - while (ds_size + 1 < large_window and this->is_valid_hash) { - if(this->get_minimized_h() == digest::MinimizedHashType::CANON){ - ds.insert(this->get_pos(), this->chash); - }else if(this->get_minimized_h() == digest::MinimizedHashType::FORWARD){ - ds.insert(this->get_pos(), this->fhash); - }else{ - ds.insert(this->get_pos(), this->rhash); - } - - this->roll_one(); - ds_size++; - } - - while (this->is_valid_hash and vec.size() < amount){ - roll_ds_wind(vec); - } - } - - template - void WindowMin::roll_ds_wind(std::vector& vec){ - if(this->get_minimized_h() == digest::MinimizedHashType::CANON){ - ds.insert(this->get_pos(), this->chash); - }else if(this->get_minimized_h() == digest::MinimizedHashType::FORWARD){ - ds.insert(this->get_pos(), this->fhash); - }else{ - ds.insert(this->get_pos(), this->rhash); - } - check(vec); - - this->roll_one(); - } - - template - void WindowMin::roll_ds_wind(std::vector>& vec){ - if(this->get_minimized_h() == digest::MinimizedHashType::CANON){ - ds.insert(this->get_pos(), this->chash); - }else if(this->get_minimized_h() == digest::MinimizedHashType::FORWARD){ - ds.insert(this->get_pos(), this->fhash); - }else{ - ds.insert(this->get_pos(), this->rhash); - } - check(vec); - - this->roll_one(); - } - - template - void WindowMin::check(std::vector& vec){ - if(is_minimized){ - if(ds.min() != prev_mini){ - prev_mini = ds.min(); - vec.emplace_back(prev_mini); - } - }else{ - is_minimized = true; - prev_mini = ds.min(); - vec.emplace_back(prev_mini); - } - } - - template - void WindowMin::check(std::vector>& vec){ - if(is_minimized){ - if(ds.min() != prev_mini){ - prev_mini = 
ds.min(); - vec.emplace_back(prev_mini, ds.min_hash()); - } - }else{ - is_minimized = true; - prev_mini = ds.min(); - vec.emplace_back(prev_mini, ds.min_hash()); - } - } -} diff --git a/meson.build b/meson.build index d6354a1..4758c0b 100644 --- a/meson.build +++ b/meson.build @@ -18,10 +18,8 @@ include_dirs = [include_directories('include'), nthash.get_variable('include_dir install_headers( 'include/digest/digester.hpp', 'include/digest/mod_minimizer.hpp', 'include/digest/syncmer.hpp', 'include/digest/window_minimizer.hpp', + 'include/digest/thread_out.hpp', 'include/digest/data_structure.hpp', - 'include/digest/syncmer.tpp', 'include/digest/window_minimizer.tpp', - 'include/digest/mod_minimizer.tpp', 'include/digest/digester.tpp', - 'include/digest/thread_out.hpp', 'include/digest/thread_out.tpp', install_dir: 'include/digest' ) diff --git a/tests/bench/benchmark.cpp b/tests/bench/benchmark.cpp index 68c8ce0..2475e20 100644 --- a/tests/bench/benchmark.cpp +++ b/tests/bench/benchmark.cpp @@ -1,329 +1,333 @@ -// perf analysis commands -// perf record --call-graph dwarf bench -// perf report -g - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define DEFAULT_LARGE_WIND 16 -#define DEFAULT_KMER_LEN 16 -#define DEFAULT_KMER_LEN2 64 -#define STR_LEN 62460029 - -std::string bench_str; -std::string s; -std::string s1; -std::string s2; - -void setupStrings() { - std::string files[] = { - "../tests/bench/chrY.txt", - }; - - for (auto &file : files) { - std::ifstream ifs(file); - ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); - std::string str; - ifs >> bench_str; - s = bench_str; - } -} - -// roll_minimizers grouping -// -------------------------------------------------------------- - -static void BM_NtHashRoll(benchmark::State &state) { - for (auto _ : state) { - state.PauseTiming(); - nthash::NtHash dig(s, 1, state.range(0)); - state.ResumeTiming(); - while (dig.roll()) - 
benchmark::DoNotOptimize(*dig.hashes()); - } -} -BENCHMARK(BM_NtHashRoll) - ->Args({4}) // spumoni2 - ->Args({15}) // minimap - ->Args({31}) - ->Iterations(16); // kraken v1 - -static void BM_ModMinRoll(benchmark::State &state) { - for (auto _ : state) { - state.PauseTiming(); - digest::ModMin dig(s, state.range(0), 17); - std::vector vec; - vec.reserve(STR_LEN); - state.ResumeTiming(); - - benchmark::DoNotOptimize(vec); - dig.roll_minimizer(STR_LEN, vec); - benchmark::ClobberMemory(); - } -} -BENCHMARK(BM_ModMinRoll) - ->Args({4}) // spumoni2 - ->Args({15}) // minimap - ->Args({31}) // kraken v1 - ->Args({16}) - ->Iterations(16); // comparison for threads - -static void BM_WindowMinRoll(benchmark::State &state) { - for (auto _ : state) { - state.PauseTiming(); -#define WINDOW(k) \ - digest::WindowMin> \ - dig(s, state.range(0), k); \ - std::vector vec; \ - vec.reserve(STR_LEN); \ - state.ResumeTiming(); \ - benchmark::DoNotOptimize(vec); \ - dig.roll_minimizer(STR_LEN, vec); \ - benchmark::ClobberMemory(); - - if (state.range(1) == 11) { - WINDOW(11) - } else if (state.range(1) == 10) { - WINDOW(10) - } else if (state.range(1) == 15) { - WINDOW(15) - } else if (state.range(1) == 16) { - WINDOW(16) - } - } -} -BENCHMARK(BM_WindowMinRoll) - ->Args({4, 11}) // spumoni2 - ->Args({15, 10}) // minimap - ->Args({31, 15}) // kraken v1 - ->Args({16, 16}) - ->Iterations(16); // comparison for threads - -static void BM_SyncmerRoll(benchmark::State &state) { - for (auto _ : state) { - state.PauseTiming(); -#define SYNCMER(k) \ - digest::Syncmer> \ - dig(s, state.range(0), k); \ - std::vector vec; \ - vec.reserve(STR_LEN); \ - state.ResumeTiming(); \ - benchmark::DoNotOptimize(vec); \ - dig.roll_minimizer(STR_LEN, vec); \ - benchmark::ClobberMemory(); - - if (state.range(1) == 12) { - SYNCMER(12) - } else if (state.range(1) == 11) { - SYNCMER(11) - } else if (state.range(1) == 16) { - SYNCMER(16) - } - } -} -BENCHMARK(BM_SyncmerRoll) - ->Args({4, 12}) // spumoni2 - ->Args({15, 
11}) // minimap - ->Args({31, 16}) // kraken v1 - ->Args({16, 16}) - ->Iterations(16); // comparison for threads - -// thread benchmarking -// --------------------------------------------------------------------- -static void BM_ThreadMod(benchmark::State &state) { - for (auto _ : state) { - state.PauseTiming(); - std::vector> vec; - state.ResumeTiming(); - - benchmark::DoNotOptimize(vec); - thread_out::thread_mod( - state.range(0), vec, s, DEFAULT_KMER_LEN, 17); - benchmark::ClobberMemory(); - } -} -BENCHMARK(BM_ThreadMod) - ->Args({1}) - ->ArgsProduct({benchmark::CreateDenseRange(2, 64, 2)}) - ->UseRealTime() - ->Iterations(16); - -static void BM_ThreadWind(benchmark::State &state) { - for (auto _ : state) { - state.PauseTiming(); - std::vector> vec; - state.ResumeTiming(); - - benchmark::DoNotOptimize(vec); - thread_out::thread_wind>( - state.range(0), vec, s, DEFAULT_KMER_LEN, DEFAULT_LARGE_WIND); - benchmark::ClobberMemory(); - } -} -BENCHMARK(BM_ThreadWind) - ->Args({1}) - ->ArgsProduct({benchmark::CreateDenseRange(2, 64, 2)}) - ->UseRealTime() - ->Iterations(16); - -static void BM_ThreadSync(benchmark::State &state) { - for (auto _ : state) { - state.PauseTiming(); - std::vector> vec; - state.ResumeTiming(); - - benchmark::DoNotOptimize(vec); - thread_out::thread_sync>( - state.range(0), vec, s, DEFAULT_KMER_LEN, DEFAULT_LARGE_WIND); - benchmark::ClobberMemory(); - } -} -BENCHMARK(BM_ThreadSync) - ->Args({1}) - ->ArgsProduct({benchmark::CreateDenseRange(2, 64, 2)}) - ->UseRealTime() - ->Iterations(16); - -// constructor sanity check grouping -// ----------------------------------------------------- -/* -static void BM_NtHashConstruction(benchmark::State& state){ - for(auto _ : state) { - nthash::NtHash dig(s, 1, DEFAULT_KMER_LEN); - benchmark::DoNotOptimize(dig); - benchmark::ClobberMemory(); - } - state.SetComplexityN(state.range(0)); -} -BENCHMARK(BM_NtHashConstruction)->Range(1<<6, -1<<18)->Setup(random)->Complexity(); - -static void 
BM_ModMinConstruction(benchmark::State& state){ - for(auto _ : state) { - digest::ModMin dig(s, DEFAULT_KMER_LEN, 17); - benchmark::DoNotOptimize(dig); - benchmark::ClobberMemory(); - } - state.SetComplexityN(state.range(0)); -} -BENCHMARK(BM_ModMinConstruction)->Range(1<<6, -1<<18)->Setup(random)->Complexity(); - -static void BM_WindowMinConstructionFixWind(benchmark::State& state){ - for(auto _ : state){ - digest::WindowMin dig(s, DEFAULT_KMER_LEN, DEFAULT_LARGE_WIND); - benchmark::DoNotOptimize(dig); - benchmark::ClobberMemory(); - } - state.SetComplexityN(state.range(0)); -} -BENCHMARK(BM_WindowMinConstructionFixWind)->Range(1<<6, -1<<18)->Setup(random)->Complexity(); - -static void BM_WindowMinConstructionFixLen(benchmark::State& state){ - for(auto _ : state){ - digest::WindowMin dig(s, DEFAULT_KMER_LEN, state.range(0)); - benchmark::DoNotOptimize(dig); - benchmark::ClobberMemory(); - } - state.SetComplexityN(state.range(0)); -} -BENCHMARK(BM_WindowMinConstructionFixLen)->Range(1<<6, -1<<18)->Setup(random)->Complexity(); - -static void BM_SyncmerConstructionFixWind(benchmark::State& state){ - for(auto _ : state){ - digest::Syncmer dig(s, DEFAULT_KMER_LEN, DEFAULT_LARGE_WIND); - benchmark::DoNotOptimize(dig); - benchmark::ClobberMemory(); - } - state.SetComplexityN(state.range(0)); -} -BENCHMARK(BM_SyncmerConstructionFixWind)->Range(1<<6, -1<<18)->Setup(random)->Complexity(); - -static void BM_SyncmerConstructionFixLen(benchmark::State& state){ - for(auto _ : state){ - digest::Syncmer dig(s, DEFAULT_KMER_LEN, state.range(0)); - benchmark::DoNotOptimize(dig); - benchmark::ClobberMemory(); - } - state.SetComplexityN(state.range(0)); -} -BENCHMARK(BM_SyncmerConstructionFixLen)->Range(1<<6, -1<<18)->Setup(random)->Complexity(); -*/ - -// append_seq sanity check grouping -// --------------------------------------------------------------- -/* -static void random_append_seq(const benchmark::State& state){ - s1 = bench_strs[0].substr(0, state.range(0)); - s2 = s = 
bench_strs[0].substr(state.range(0), state.range(1)); -} - -static void BM_ModMinConstruction(benchmark::State& state){ - for(auto _ : state) { - digest::ModMin dig(s, DEFAULT_KMER_LEN2, 17); - benchmark::DoNotOptimize(dig); - benchmark::ClobberMemory(); - } -} - -static void BM_ModMinRoll(benchmark::State& state) { - for(auto _ : state) { - digest::ModMin dig(s, DEFAULT_KMER_LEN2, 17); - std::vector vec; - vec.reserve(state.range(0)); - - benchmark::DoNotOptimize(vec); - dig.roll_minimizer(state.range(0), vec); - benchmark::ClobberMemory(); - } -} - -static void BM_append_seq(benchmark::State& state){ - - for(auto _ : state) { - digest::ModMin dig(s1, DEFAULT_KMER_LEN2, 17); - dig.append_seq(s2); - - benchmark::DoNotOptimize(dig); - benchmark::ClobberMemory(); - } -} - -static void BM_append_seq_roll(benchmark::State& state){ - - for(auto _ : state) { - digest::ModMin dig(s1, DEFAULT_KMER_LEN2, 17); - dig.append_seq(s2); - std::vector vec; - vec.reserve(state.range(0) + state.range(1)); - - benchmark::DoNotOptimize(vec); - dig.roll_minimizer(state.range(0), vec); - benchmark::ClobberMemory(); - } -} -BENCHMARK(BM_ModMinConstruction)->Arg(127)->Setup(random); -BENCHMARK(BM_append_seq)->Args({63, 64})->Setup(random_append_seq); -BENCHMARK(BM_ModMinRoll)->Arg(127)->Arg(263)->Arg(563)->Setup(random); -BENCHMARK(BM_append_seq_roll)->Args({63, 64})->Args({63, 200})->Args({63, -500})->Setup(random_append_seq); -*/ - -int main(int argc, char **argv) { - setupStrings(); - benchmark::Initialize(&argc, argv); - benchmark::RunSpecifiedBenchmarks(); -} +// perf analysis commands +// perf record --call-graph dwarf bench +// perf report -g + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEFAULT_LARGE_WIND 16 +#define DEFAULT_KMER_LEN 16 +#define DEFAULT_KMER_LEN2 64 +#define STR_LEN 62460029 + +std::string bench_str; +std::string s; +std::string s1; +std::string s2; + +void setupStrings() { + std::string files[] = { + 
"../tests/bench/chrY.txt", + }; + + for (auto &file : files) { + std::ifstream ifs(file); + ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); + std::string str; + ifs >> bench_str; + s = bench_str; + } +} + +// roll_minimizers grouping +// -------------------------------------------------------------- + +static void BM_NtHashRoll(benchmark::State &state) { + for (auto _ : state) { + state.PauseTiming(); + nthash::NtHash dig(s, 1, state.range(0)); + state.ResumeTiming(); + while (dig.roll()) + benchmark::DoNotOptimize(*dig.hashes()); + } +} +BENCHMARK(BM_NtHashRoll) + ->Args({4}) // spumoni2 + ->Args({15}) // minimap + ->Args({31}) + ->Iterations(16); // kraken v1 + +static void BM_ModMinRoll(benchmark::State &state) { + for (auto _ : state) { + state.PauseTiming(); + digest::ModMin dig(s, state.range(0), + 17); + std::vector vec; + vec.reserve(STR_LEN); + state.ResumeTiming(); + + benchmark::DoNotOptimize(vec); + dig.roll_minimizer(STR_LEN, vec); + benchmark::ClobberMemory(); + } +} +BENCHMARK(BM_ModMinRoll) + ->Args({4}) // spumoni2 + ->Args({15}) // minimap + ->Args({31}) // kraken v1 + ->Args({16}) + ->Iterations(16); // comparison for threads + +static void BM_WindowMinRoll(benchmark::State &state) { + for (auto _ : state) { + state.PauseTiming(); +#define WINDOW(k) \ + digest::WindowMin> \ + dig(s, state.range(0), k); \ + std::vector vec; \ + vec.reserve(STR_LEN); \ + state.ResumeTiming(); \ + benchmark::DoNotOptimize(vec); \ + dig.roll_minimizer(STR_LEN, vec); \ + benchmark::ClobberMemory(); + + if (state.range(1) == 11) { + WINDOW(11) + } else if (state.range(1) == 10) { + WINDOW(10) + } else if (state.range(1) == 15) { + WINDOW(15) + } else if (state.range(1) == 16) { + WINDOW(16) + } + } +} +BENCHMARK(BM_WindowMinRoll) + ->Args({4, 11}) // spumoni2 + ->Args({15, 10}) // minimap + ->Args({31, 15}) // kraken v1 + ->Args({16, 16}) + ->Iterations(16); // comparison for threads + +static void BM_SyncmerRoll(benchmark::State &state) { + for (auto _ 
: state) { + state.PauseTiming(); +#define SYNCMER(k) \ + digest::Syncmer> \ + dig(s, state.range(0), k); \ + std::vector vec; \ + vec.reserve(STR_LEN); \ + state.ResumeTiming(); \ + benchmark::DoNotOptimize(vec); \ + dig.roll_minimizer(STR_LEN, vec); \ + benchmark::ClobberMemory(); + + if (state.range(1) == 12) { + SYNCMER(12) + } else if (state.range(1) == 11) { + SYNCMER(11) + } else if (state.range(1) == 16) { + SYNCMER(16) + } + } +} +BENCHMARK(BM_SyncmerRoll) + ->Args({4, 12}) // spumoni2 + ->Args({15, 11}) // minimap + ->Args({31, 16}) // kraken v1 + ->Args({16, 16}) + ->Iterations(16); // comparison for threads + +// thread benchmarking +// --------------------------------------------------------------------- +static void BM_ThreadMod(benchmark::State &state) { + for (auto _ : state) { + state.PauseTiming(); + std::vector> vec; + state.ResumeTiming(); + + benchmark::DoNotOptimize(vec); + digest::thread_out::thread_mod( + state.range(0), vec, s, DEFAULT_KMER_LEN, 17); + benchmark::ClobberMemory(); + } +} +BENCHMARK(BM_ThreadMod) + ->Args({1}) + ->ArgsProduct({benchmark::CreateDenseRange(2, 64, 2)}) + ->UseRealTime() + ->Iterations(16); + +static void BM_ThreadWind(benchmark::State &state) { + for (auto _ : state) { + state.PauseTiming(); + std::vector> vec; + state.ResumeTiming(); + + benchmark::DoNotOptimize(vec); + digest::thread_out::thread_wind< + digest::BadCharPolicy::SKIPOVER, + digest::ds::SegmentTree>( + state.range(0), vec, s, DEFAULT_KMER_LEN, DEFAULT_LARGE_WIND); + benchmark::ClobberMemory(); + } +} +BENCHMARK(BM_ThreadWind) + ->Args({1}) + ->ArgsProduct({benchmark::CreateDenseRange(2, 64, 2)}) + ->UseRealTime() + ->Iterations(16); + +static void BM_ThreadSync(benchmark::State &state) { + for (auto _ : state) { + state.PauseTiming(); + std::vector> vec; + state.ResumeTiming(); + + benchmark::DoNotOptimize(vec); + digest::thread_out::thread_sync< + digest::BadCharPolicy::SKIPOVER, + digest::ds::SegmentTree>( + state.range(0), vec, s, 
DEFAULT_KMER_LEN, DEFAULT_LARGE_WIND); + benchmark::ClobberMemory(); + } +} +BENCHMARK(BM_ThreadSync) + ->Args({1}) + ->ArgsProduct({benchmark::CreateDenseRange(2, 64, 2)}) + ->UseRealTime() + ->Iterations(16); + +// constructor sanity check grouping +// ----------------------------------------------------- +/* +static void BM_NtHashConstruction(benchmark::State& state){ + for(auto _ : state) { + nthash::NtHash dig(s, 1, DEFAULT_KMER_LEN); + benchmark::DoNotOptimize(dig); + benchmark::ClobberMemory(); + } + state.SetComplexityN(state.range(0)); +} +BENCHMARK(BM_NtHashConstruction)->Range(1<<6, +1<<18)->Setup(random)->Complexity(); + +static void BM_ModMinConstruction(benchmark::State& state){ + for(auto _ : state) { + digest::ModMin dig(s, DEFAULT_KMER_LEN, 17); + benchmark::DoNotOptimize(dig); + benchmark::ClobberMemory(); + } + state.SetComplexityN(state.range(0)); +} +BENCHMARK(BM_ModMinConstruction)->Range(1<<6, +1<<18)->Setup(random)->Complexity(); + +static void BM_WindowMinConstructionFixWind(benchmark::State& state){ + for(auto _ : state){ + digest::WindowMin dig(s, DEFAULT_KMER_LEN, DEFAULT_LARGE_WIND); + benchmark::DoNotOptimize(dig); + benchmark::ClobberMemory(); + } + state.SetComplexityN(state.range(0)); +} +BENCHMARK(BM_WindowMinConstructionFixWind)->Range(1<<6, +1<<18)->Setup(random)->Complexity(); + +static void BM_WindowMinConstructionFixLen(benchmark::State& state){ + for(auto _ : state){ + digest::WindowMin dig(s, DEFAULT_KMER_LEN, state.range(0)); + benchmark::DoNotOptimize(dig); + benchmark::ClobberMemory(); + } + state.SetComplexityN(state.range(0)); +} +BENCHMARK(BM_WindowMinConstructionFixLen)->Range(1<<6, +1<<18)->Setup(random)->Complexity(); + +static void BM_SyncmerConstructionFixWind(benchmark::State& state){ + for(auto _ : state){ + digest::Syncmer dig(s, DEFAULT_KMER_LEN, DEFAULT_LARGE_WIND); + benchmark::DoNotOptimize(dig); + benchmark::ClobberMemory(); + } + state.SetComplexityN(state.range(0)); +} 
+BENCHMARK(BM_SyncmerConstructionFixWind)->Range(1<<6, +1<<18)->Setup(random)->Complexity(); + +static void BM_SyncmerConstructionFixLen(benchmark::State& state){ + for(auto _ : state){ + digest::Syncmer dig(s, DEFAULT_KMER_LEN, state.range(0)); + benchmark::DoNotOptimize(dig); + benchmark::ClobberMemory(); + } + state.SetComplexityN(state.range(0)); +} +BENCHMARK(BM_SyncmerConstructionFixLen)->Range(1<<6, +1<<18)->Setup(random)->Complexity(); +*/ + +// append_seq sanity check grouping +// --------------------------------------------------------------- +/* +static void random_append_seq(const benchmark::State& state){ + s1 = bench_strs[0].substr(0, state.range(0)); + s2 = s = bench_strs[0].substr(state.range(0), state.range(1)); +} + +static void BM_ModMinConstruction(benchmark::State& state){ + for(auto _ : state) { + digest::ModMin dig(s, DEFAULT_KMER_LEN2, 17); + benchmark::DoNotOptimize(dig); + benchmark::ClobberMemory(); + } +} + +static void BM_ModMinRoll(benchmark::State& state) { + for(auto _ : state) { + digest::ModMin dig(s, DEFAULT_KMER_LEN2, 17); + std::vector vec; + vec.reserve(state.range(0)); + + benchmark::DoNotOptimize(vec); + dig.roll_minimizer(state.range(0), vec); + benchmark::ClobberMemory(); + } +} + +static void BM_append_seq(benchmark::State& state){ + + for(auto _ : state) { + digest::ModMin dig(s1, DEFAULT_KMER_LEN2, 17); + dig.append_seq(s2); + + benchmark::DoNotOptimize(dig); + benchmark::ClobberMemory(); + } +} + +static void BM_append_seq_roll(benchmark::State& state){ + + for(auto _ : state) { + digest::ModMin dig(s1, DEFAULT_KMER_LEN2, 17); + dig.append_seq(s2); + std::vector vec; + vec.reserve(state.range(0) + state.range(1)); + + benchmark::DoNotOptimize(vec); + dig.roll_minimizer(state.range(0), vec); + benchmark::ClobberMemory(); + } +} +BENCHMARK(BM_ModMinConstruction)->Arg(127)->Setup(random); +BENCHMARK(BM_append_seq)->Args({63, 64})->Setup(random_append_seq); 
+BENCHMARK(BM_ModMinRoll)->Arg(127)->Arg(263)->Arg(563)->Setup(random); +BENCHMARK(BM_append_seq_roll)->Args({63, 64})->Args({63, 200})->Args({63, +500})->Setup(random_append_seq); +*/ + +int main(int argc, char **argv) { + setupStrings(); + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); +} diff --git a/tests/bench/str_gen.cpp b/tests/bench/str_gen.cpp index 19e452d..ff8c01f 100644 --- a/tests/bench/str_gen.cpp +++ b/tests/bench/str_gen.cpp @@ -1,109 +1,109 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -using namespace std; -typedef long long ll; -// if you end up using long double, you need to set the floating point notation -// to fixed, and set the percision to be very high -typedef long double ld; - -// contrsuct umaps like this, unordered_map -// safe_map; FIXED_RANDOM is static so it doesn not get redeclared between -// function calls -struct custom_hash { - static uint64_t splitmix64(uint64_t x) { - // http://xorshift.di.unimi.it/splitmix64.c - x += 0x9e3779b97f4a7c15; - x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9; - x = (x ^ (x >> 27)) * 0x94d049bb133111eb; - return x ^ (x >> 31); - } - - size_t operator()(uint64_t x) const { - - static const uint64_t FIXED_RANDOM = - chrono::steady_clock::now().time_since_epoch().count(); - return splitmix64(x + FIXED_RANDOM); - } -}; - -#define INF 2001001001 -#define INF2 2e18 -#define MOD 1000000007 - -#define f0r(a, b) for (long long a = 0; a < b; a++) -#define f1r(a, b, c) for (long long a = b; a < c; a++) -#define max3(a, b, c) max(a, max(b, c)) -#define min3(a, b, c) min(a, min(b, c)) -#define pb push_back -#define pf push_front -#define f first -#define s second -#define mp make_pair -#define pll pair -#define pii pair -#define tp make_tuple - -// first four are north, west, east ,south 
-int dir1[] = {1, 0, -1, 0, 1, 1, -1, -1}; -int dir2[] = {0, 1, 0, -1, 1, -1, 1, -1}; - -int main() { - - // use this if you read in from a file - - // freopen("in.txt", "r", stdin); - freopen("ACTG.txt", "w", stdout); - - char chars[4] = {'A', 'C', 'T', 'G'}; - for (int i = 0; i < 1; i++) { - string temp; - for (int j = 0; j < 1e7; j++) { - int curr = rand() % 4; - temp.pb(chars[curr]); - } - cout << temp << endl; - cout << endl; - } - /* - freopen("non-ACTG.txt", "w", stdout); - char chars2[5] = {'A', 'C', 'T', 'G', 'N'}; - for(int i =0; i < 100; i++){ - string temp; - for(int j = 0; j < 1e5; j++){ - int curr = rand() % 33; - if(curr == 32){ - temp.pb(chars2[4]); - }else{ - curr %= 4; - temp.pb(chars2[curr]); - } - } - cout << temp << endl; - cout << endl; - } - */ - return 0; +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +using namespace std; +typedef long long ll; +// if you end up using long double, you need to set the floating point notation +// to fixed, and set the percision to be very high +typedef long double ld; + +// contrsuct umaps like this, unordered_map +// safe_map; FIXED_RANDOM is static so it doesn not get redeclared between +// function calls +struct custom_hash { + static uint64_t splitmix64(uint64_t x) { + // http://xorshift.di.unimi.it/splitmix64.c + x += 0x9e3779b97f4a7c15; + x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9; + x = (x ^ (x >> 27)) * 0x94d049bb133111eb; + return x ^ (x >> 31); + } + + size_t operator()(uint64_t x) const { + + static const uint64_t FIXED_RANDOM = + chrono::steady_clock::now().time_since_epoch().count(); + return splitmix64(x + FIXED_RANDOM); + } +}; + +#define INF 2001001001 +#define INF2 2e18 +#define MOD 1000000007 + +#define f0r(a, b) for (long long a = 0; a < b; a++) +#define f1r(a, b, c) for (long long a = b; a < 
c; a++) +#define max3(a, b, c) max(a, max(b, c)) +#define min3(a, b, c) min(a, min(b, c)) +#define pb push_back +#define pf push_front +#define f first +#define s second +#define mp make_pair +#define pll pair +#define pii pair +#define tp make_tuple + +// first four are north, west, east ,south +int dir1[] = {1, 0, -1, 0, 1, 1, -1, -1}; +int dir2[] = {0, 1, 0, -1, 1, -1, 1, -1}; + +int main() { + + // use this if you read in from a file + + // freopen("in.txt", "r", stdin); + freopen("ACTG.txt", "w", stdout); + + char chars[4] = {'A', 'C', 'T', 'G'}; + for (int i = 0; i < 1; i++) { + string temp; + for (int j = 0; j < 1e7; j++) { + int curr = rand() % 4; + temp.pb(chars[curr]); + } + cout << temp << endl; + cout << endl; + } + /* + freopen("non-ACTG.txt", "w", stdout); + char chars2[5] = {'A', 'C', 'T', 'G', 'N'}; + for(int i =0; i < 100; i++){ + string temp; + for(int j = 0; j < 1e5; j++){ + int curr = rand() % 33; + if(curr == 32){ + temp.pb(chars2[4]); + }else{ + curr %= 4; + temp.pb(chars2[curr]); + } + } + cout << temp << endl; + cout << endl; + } + */ + return 0; } \ No newline at end of file diff --git a/tests/data_structure/bench_ds.cpp b/tests/data_structure/bench_ds.cpp index 7a099ac..a89af7f 100644 --- a/tests/data_structure/bench_ds.cpp +++ b/tests/data_structure/bench_ds.cpp @@ -1,174 +1,172 @@ -#include -#include - -#include -#include -#include -#include - -// segtee wins at 12 -// naive2 wins at 17 - -namespace digest::ds { - -template struct MonoQueue { - int head = 0, tail = 0; - // one extra slot so empty() works when full - // {hash, index, time} - std::array, 3> queue; - uint32_t time = 0; - - bool empty() { return head == tail; } - - MonoQueue(uint32_t) { - queue[2].fill(0); // necessary to avoid 1 in a billion collision - queue[0].fill(0); // gets rid of warning - } - MonoQueue(const MonoQueue &other) = default; - MonoQueue &operator=(const MonoQueue &other) = default; - - void insert(uint32_t index, uint32_t hash) { - if (queue[2][head] == 
time - k) { - if (++head == k + 1) { - head = 0; - } - } - - while (not empty() and queue[0][tail == 0 ? k : tail - 1] >= hash) { - if (--tail == -1) { - tail = k; - } - } - - queue[0][tail] = hash; - queue[1][tail] = index; - queue[2][tail] = time++; - - if (++tail == k + 1) - tail = 0; - } - - uint32_t min() { return queue[1][head]; } - - // void min_syncmer(std::vector &vec) { - // if (queue[2][head] == time - k or queue[2][head] == time - 1) { - // vec.emplace_back(queue[1][head]); - // } - // } -}; - -template struct Set { - std::set mset; - std::array::iterator, k> vec; - int i = 0; - - Set(uint32_t) { - // edge case where hash == all 1's can cause a set collision. - for (unsigned i = 0; i < k; i++) { - vec[i] = mset.emplace(i).first; - } - } - Set(const Set &other) = delete; // have to copy over iterators - Set &operator=(const Set &other) = delete; - - void insert(uint32_t index, uint32_t hash) { - mset.erase(vec[i]); - - vec[i] = mset.emplace((uint64_t)~hash << 32 | index).first; - if (++i == k) - i = 0; - } - - uint32_t min() { return *mset.rbegin(); } -}; - -} // namespace digest::ds - -const int INPUT_SIZE = 1e7; -std::array hashes; - -std::map>> all; - -void setupInput() { - std::random_device rd; // seed - std::mt19937 gen(rd()); // generator - std::uniform_int_distribution distrib(0, UINT32_MAX); // [0, 2**32] - for (uint32_t &h : hashes) { - h = distrib(gen); - } - - // edge test for ties - // for (int i = 0; i < 2*INPUT_SIZE; i++) { - // hashes[i] = hashes[0]; - // } -} - -template static void BM(benchmark::State &state) { - auto &temp = all[out][k]; - for (auto _ : state) { - T ds(k); - for (int i = 0; i < k - 1; i++) { - ds.insert(i, hashes[i]); - } - for (int i = 0; i < INPUT_SIZE; i++) { - ds.insert(i + k - 1, hashes[i + k - 1]); - temp[i] = ds.min(); - } - benchmark::ClobberMemory(); - } -} - -#define test(name, out) \ - BENCHMARK_TEMPLATE(BM, 4, name<4>, out); \ - BENCHMARK_TEMPLATE(BM, 5, name<5>, out); \ - BENCHMARK_TEMPLATE(BM, 8, name<8>, 
out); \ - BENCHMARK_TEMPLATE(BM, 9, name<9>, out); \ - BENCHMARK_TEMPLATE(BM, 12, name<12>, out); \ - BENCHMARK_TEMPLATE(BM, 16, name<16>, out); \ - BENCHMARK_TEMPLATE(BM, 17, name<17>, out); \ - BENCHMARK_TEMPLATE(BM, 32, name<32>, out); \ - BENCHMARK_TEMPLATE(BM, 33, name<33>, out); \ - BENCHMARK_TEMPLATE(BM, 64, name<64>, out); \ - BENCHMARK_TEMPLATE(BM, 96, name<96>, out); \ - BENCHMARK_TEMPLATE(BM, 128, name<128>, out); \ - BENCHMARK_TEMPLATE(BM, 256, name<256>, out); \ - BENCHMARK_TEMPLATE(BM, 512, name<512>, out); \ - BENCHMARK_TEMPLATE(BM, 1024, name<1024>, out); - -#define test2(name, out) \ - BENCHMARK_TEMPLATE(BM, 4, name, out); \ - BENCHMARK_TEMPLATE(BM, 5, name, out); \ - BENCHMARK_TEMPLATE(BM, 8, name, out); \ - BENCHMARK_TEMPLATE(BM, 9, name, out); \ - BENCHMARK_TEMPLATE(BM, 12, name, out); \ - BENCHMARK_TEMPLATE(BM, 16, name, out); \ - BENCHMARK_TEMPLATE(BM, 17, name, out); \ - BENCHMARK_TEMPLATE(BM, 32, name, out); \ - BENCHMARK_TEMPLATE(BM, 33, name, out); \ - BENCHMARK_TEMPLATE(BM, 64, name, out); \ - BENCHMARK_TEMPLATE(BM, 96, name, out); \ - BENCHMARK_TEMPLATE(BM, 128, name, out); \ - BENCHMARK_TEMPLATE(BM, 256, name, out); \ - BENCHMARK_TEMPLATE(BM, 512, name, out); \ - BENCHMARK_TEMPLATE(BM, 1024, name, out); - -test(digest::ds::Naive, 0) -test(digest::ds::Naive2, 1) -test(digest::ds::MonoQueue, 2) -test(digest::ds::SegmentTree, 3) -test(digest::ds::Set, 4) -test2(digest::ds::Adaptive, 5) -test2(digest::ds::Adaptive64, 6) - -int main(int argc, char **argv) { - setupInput(); - benchmark::Initialize(&argc, argv); - benchmark::RunSpecifiedBenchmarks(); - - // sanity check - for (auto &[_, m] : all) { - assert(m == all.begin()->second); - } - - std::cout << "Passed Asserts!" 
<< std::endl; -} +#include +#include + +#include +#include +#include +#include + +// segtee wins at 12 +// naive2 wins at 17 + +namespace digest::ds { + +template struct MonoQueue { + int head = 0, tail = 0; + // one extra slot so empty() works when full + // {hash, index, time} + std::array, 3> queue; + uint32_t time = 0; + + bool empty() { return head == tail; } + + MonoQueue(uint32_t) { + queue[2].fill(0); // necessary to avoid 1 in a billion collision + queue[0].fill(0); // gets rid of warning + } + MonoQueue(const MonoQueue &other) = default; + MonoQueue &operator=(const MonoQueue &other) = default; + + void insert(uint32_t index, uint32_t hash) { + if (queue[2][head] == time - k) { + if (++head == k + 1) { + head = 0; + } + } + + while (not empty() and queue[0][tail == 0 ? k : tail - 1] >= hash) { + if (--tail == -1) { + tail = k; + } + } + + queue[0][tail] = hash; + queue[1][tail] = index; + queue[2][tail] = time++; + + if (++tail == k + 1) + tail = 0; + } + + uint32_t min() { return queue[1][head]; } + + // void min_syncmer(std::vector &vec) { + // if (queue[2][head] == time - k or queue[2][head] == time - 1) { + // vec.emplace_back(queue[1][head]); + // } + // } +}; + +template struct Set { + std::set mset; + std::array::iterator, k> vec; + int i = 0; + + Set(uint32_t) { + // edge case where hash == all 1's can cause a set collision. 
+ for (unsigned i = 0; i < k; i++) { + vec[i] = mset.emplace(i).first; + } + } + Set(const Set &other) = delete; // have to copy over iterators + Set &operator=(const Set &other) = delete; + + void insert(uint32_t index, uint32_t hash) { + mset.erase(vec[i]); + + vec[i] = mset.emplace((uint64_t)~hash << 32 | index).first; + if (++i == k) + i = 0; + } + + uint32_t min() { return *mset.rbegin(); } +}; + +} // namespace digest::ds + +const int INPUT_SIZE = 1e7; +std::array hashes; + +std::map>> all; + +void setupInput() { + std::random_device rd; // seed + std::mt19937 gen(rd()); // generator + std::uniform_int_distribution distrib(0, + UINT32_MAX); // [0, 2**32] + for (uint32_t &h : hashes) { + h = distrib(gen); + } + + // edge test for ties + // for (int i = 0; i < 2*INPUT_SIZE; i++) { + // hashes[i] = hashes[0]; + // } +} + +template static void BM(benchmark::State &state) { + auto &temp = all[out][k]; + for (auto _ : state) { + T ds(k); + for (int i = 0; i < k - 1; i++) { + ds.insert(i, hashes[i]); + } + for (int i = 0; i < INPUT_SIZE; i++) { + ds.insert(i + k - 1, hashes[i + k - 1]); + temp[i] = ds.min(); + } + benchmark::ClobberMemory(); + } +} + +#define test(name, out) \ + BENCHMARK_TEMPLATE(BM, 4, name<4>, out); \ + BENCHMARK_TEMPLATE(BM, 5, name<5>, out); \ + BENCHMARK_TEMPLATE(BM, 8, name<8>, out); \ + BENCHMARK_TEMPLATE(BM, 9, name<9>, out); \ + BENCHMARK_TEMPLATE(BM, 12, name<12>, out); \ + BENCHMARK_TEMPLATE(BM, 16, name<16>, out); \ + BENCHMARK_TEMPLATE(BM, 17, name<17>, out); \ + BENCHMARK_TEMPLATE(BM, 32, name<32>, out); \ + BENCHMARK_TEMPLATE(BM, 33, name<33>, out); \ + BENCHMARK_TEMPLATE(BM, 64, name<64>, out); \ + BENCHMARK_TEMPLATE(BM, 96, name<96>, out); \ + BENCHMARK_TEMPLATE(BM, 128, name<128>, out); \ + BENCHMARK_TEMPLATE(BM, 256, name<256>, out); \ + BENCHMARK_TEMPLATE(BM, 512, name<512>, out); \ + BENCHMARK_TEMPLATE(BM, 1024, name<1024>, out); + +#define test2(name, out) \ + BENCHMARK_TEMPLATE(BM, 4, name, out); \ + BENCHMARK_TEMPLATE(BM, 5, 
name, out); \ + BENCHMARK_TEMPLATE(BM, 8, name, out); \ + BENCHMARK_TEMPLATE(BM, 9, name, out); \ + BENCHMARK_TEMPLATE(BM, 12, name, out); \ + BENCHMARK_TEMPLATE(BM, 16, name, out); \ + BENCHMARK_TEMPLATE(BM, 17, name, out); \ + BENCHMARK_TEMPLATE(BM, 32, name, out); \ + BENCHMARK_TEMPLATE(BM, 33, name, out); \ + BENCHMARK_TEMPLATE(BM, 64, name, out); \ + BENCHMARK_TEMPLATE(BM, 96, name, out); \ + BENCHMARK_TEMPLATE(BM, 128, name, out); \ + BENCHMARK_TEMPLATE(BM, 256, name, out); \ + BENCHMARK_TEMPLATE(BM, 512, name, out); \ + BENCHMARK_TEMPLATE(BM, 1024, name, out); + +test(digest::ds::Naive, 0) test(digest::ds::Naive2, 1) + test(digest::ds::MonoQueue, 2) test(digest::ds::SegmentTree, 3) + test(digest::ds::Set, 4) test2(digest::ds::Adaptive, 5) + test2(digest::ds::Adaptive64, 6) + + int main(int argc, char **argv) { + setupInput(); + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); + + // sanity check + for (auto &[_, m] : all) { + assert(m == all.begin()->second); + } + + std::cout << "Passed Asserts!" 
<< std::endl; +} diff --git a/tests/density/ACTG.cpp b/tests/density/ACTG.cpp index 06b0263..626117d 100644 --- a/tests/density/ACTG.cpp +++ b/tests/density/ACTG.cpp @@ -1,113 +1,113 @@ -#include - -#include "digest/data_structure.hpp" -#include "digest/mod_minimizer.hpp" -#include "digest/syncmer.hpp" -#include "digest/window_minimizer.hpp" - -typedef long long ll; -// if you end up using long double, you need to set the floating point notation -// to fixed, and set the percision to be very high -typedef long double ld; - -#define INF 2001001001 -#define INF2 2e18 -#define MOD 1000000007 - -#define max3(a, b, c) max(a, max(b, c)) -#define min3(a, b, c) min(a, min(b, c)) -#define pb push_back -#define pf push_front -#define f first -#define s second -#define mp make_pair -#define pll pair -#define pii pair -#define tp make_tuple - -// first four are north, west, east ,south -int dir1[] = {1, 0, -1, 0, 1, 1, -1, -1}; -int dir2[] = {0, 1, 0, -1, 1, -1, 1, -1}; - -int main() { - - std::cout << std::fixed << std::setprecision(8); - // if you use ld, use the above and don't use string stream - - std::string str; - - std::vector strs; - assert(freopen("../tests/density/ACTG.txt", "r", stdin)); - for (int i = 0; i < 100; i++) { - std::cin >> str; - strs.pb(str); - } - - std::vector> mod_min_vec(4, std::vector()); - std::vector> wind_min_vec(4, std::vector()); - std::vector> sync_vec(4, std::vector()); - - uint64_t mods[4] = {109, 128, 1009, 1024}; - unsigned l_winds[4] = {7, 8, 17, 16}; - - double kmers = 100000 - 16 + 1; - - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 100; j++) { - digest::ModMin mm( - strs[j], 16, mods[i], 0, 0, digest::MinimizedHashType::CANON); - std::vector temp; - mm.roll_minimizer(100000, temp); - double am = temp.size(); - am /= kmers; - mod_min_vec[i].pb(am); - } - } - - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 100; j++) { - digest::WindowMin - wm(strs[j], 16, l_winds[i], 0, digest::MinimizedHashType::CANON); - std::vector 
temp; - wm.roll_minimizer(100000, temp); - double am = temp.size(); - am /= kmers; - wind_min_vec[i].pb(am); - } - } - - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 100; j++) { - digest::Syncmer - syn(strs[j], 16, l_winds[i], 0, digest::MinimizedHashType::CANON); - std::vector temp; - syn.roll_minimizer(100000, temp); - double am = temp.size(); - am /= kmers; - sync_vec[i].pb(am); - } - } - assert(freopen("../tests/density/out1.txt", "w", stdout)); - for (int i = 0; i < 4; i++) { - for (size_t j = 0; j < 100; j++) { - std::cout << mod_min_vec[i][j] << " "; - } - std::cout << std::endl; - } - - for (int i = 0; i < 4; i++) { - for (size_t j = 0; j < 100; j++) { - std::cout << wind_min_vec[i][j] << " "; - } - std::cout << std::endl; - } - - for (int i = 0; i < 4; i++) { - for (size_t j = 0; j < 100; j++) { - std::cout << sync_vec[i][j] << " "; - } - std::cout << std::endl; - } - - return 0; +#include + +#include "digest/data_structure.hpp" +#include "digest/mod_minimizer.hpp" +#include "digest/syncmer.hpp" +#include "digest/window_minimizer.hpp" + +typedef long long ll; +// if you end up using long double, you need to set the floating point notation +// to fixed, and set the percision to be very high +typedef long double ld; + +#define INF 2001001001 +#define INF2 2e18 +#define MOD 1000000007 + +#define max3(a, b, c) max(a, max(b, c)) +#define min3(a, b, c) min(a, min(b, c)) +#define pb push_back +#define pf push_front +#define f first +#define s second +#define mp make_pair +#define pll pair +#define pii pair +#define tp make_tuple + +// first four are north, west, east ,south +int dir1[] = {1, 0, -1, 0, 1, 1, -1, -1}; +int dir2[] = {0, 1, 0, -1, 1, -1, 1, -1}; + +int main() { + + std::cout << std::fixed << std::setprecision(8); + // if you use ld, use the above and don't use string stream + + std::string str; + + std::vector strs; + assert(freopen("../tests/density/ACTG.txt", "r", stdin)); + for (int i = 0; i < 100; i++) { + std::cin >> str; + strs.pb(str); 
+ } + + std::vector> mod_min_vec(4, std::vector()); + std::vector> wind_min_vec(4, std::vector()); + std::vector> sync_vec(4, std::vector()); + + uint64_t mods[4] = {109, 128, 1009, 1024}; + unsigned l_winds[4] = {7, 8, 17, 16}; + + double kmers = 100000 - 16 + 1; + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 100; j++) { + digest::ModMin mm( + strs[j], 16, mods[i], 0, 0, digest::MinimizedHashType::CANON); + std::vector temp; + mm.roll_minimizer(100000, temp); + double am = temp.size(); + am /= kmers; + mod_min_vec[i].pb(am); + } + } + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 100; j++) { + digest::WindowMin + wm(strs[j], 16, l_winds[i], 0, digest::MinimizedHashType::CANON); + std::vector temp; + wm.roll_minimizer(100000, temp); + double am = temp.size(); + am /= kmers; + wind_min_vec[i].pb(am); + } + } + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 100; j++) { + digest::Syncmer + syn(strs[j], 16, l_winds[i], 0, digest::MinimizedHashType::CANON); + std::vector temp; + syn.roll_minimizer(100000, temp); + double am = temp.size(); + am /= kmers; + sync_vec[i].pb(am); + } + } + assert(freopen("../tests/density/out1.txt", "w", stdout)); + for (int i = 0; i < 4; i++) { + for (size_t j = 0; j < 100; j++) { + std::cout << mod_min_vec[i][j] << " "; + } + std::cout << std::endl; + } + + for (int i = 0; i < 4; i++) { + for (size_t j = 0; j < 100; j++) { + std::cout << wind_min_vec[i][j] << " "; + } + std::cout << std::endl; + } + + for (int i = 0; i < 4; i++) { + for (size_t j = 0; j < 100; j++) { + std::cout << sync_vec[i][j] << " "; + } + std::cout << std::endl; + } + + return 0; } \ No newline at end of file diff --git a/tests/density/CMakeLists.txt b/tests/density/CMakeLists.txt index 22c7c83..ee08b92 100644 --- a/tests/density/CMakeLists.txt +++ b/tests/density/CMakeLists.txt @@ -1,25 +1,25 @@ -cmake_minimum_required(VERSION 3.5) - -project(Expected) - -set (CMAKE_CXX_STANDARD 11) - -Include(FetchContent) - -FetchContent_Declare( - Digest - 
GIT_REPOSITORY https://github.com/VeryAmazed/Digest - GIT_TAG 0ebc52f89c5126a6d85c71224cb06251d9d43418 -) - -FetchContent_MakeAvailable(Digest) - -add_executable(expected - non-ACTG.cpp -) - -target_link_libraries( expected - PRIVATE - digester -) - +cmake_minimum_required(VERSION 3.5) + +project(Expected) + +set (CMAKE_CXX_STANDARD 11) + +Include(FetchContent) + +FetchContent_Declare( + Digest + GIT_REPOSITORY https://github.com/VeryAmazed/Digest + GIT_TAG 0ebc52f89c5126a6d85c71224cb06251d9d43418 +) + +FetchContent_MakeAvailable(Digest) + +add_executable(expected + non-ACTG.cpp +) + +target_link_libraries( expected + PRIVATE + digester +) + diff --git a/tests/density/Results.md b/tests/density/Results.md index 02f447e..293381c 100644 --- a/tests/density/Results.md +++ b/tests/density/Results.md @@ -1,13 +1,13 @@ -I wanted to do some checks on the behavior of my program and so I decided to throw together some software to look at the density of minimizers, $\frac{number-of-minimizers}{number-of-total-kmers}$.
-# Set Up -For this series of experiments, the small window size was set to 16 for everything, and the sequence length was set to $10^5$. I wrote a program to randomly generate 100 sequences that only contained ACTG characters and another 100 sequences that could contain the character N, along with ACTG characters. So for the ACTG only sequences, there are 99985 kmers, and for the non-ACTG allowed sequences, the probability of a character being N was set to 1/33, and on average the number of kmers was between 60000 and 62000. The program I wrote does an actual count of how many k-mers are in the sequence following the same skipping rules that the Digester does.
- -For the Mod Minimizer, I ran tests using 4 different mod values, M, 2 prime and 2 not prime. I thought of X as a Bernoulli(1/M) representing the probability that a given kmer in the sequence was a minimizer.
- -For the [Window Minimizer](https://academic.oup.com/bioinformatics/article/33/14/i110/3953951) and [Syncmer](https://peerj.com/articles/10805/), apparently the expected density of minimizers is $\frac{2}{w+1}$ and $\frac{2}{w}$ respectively, where w is the number of kmers in the large window. The syncmers I implement are what the paper calls closed syncmers. The explanation for these densities, briefly and assuming hashes are i.i.d, is that for window minimizers is that when you slide the large window over 1 spot, you get a new minimzer if the the hash of the kmer that just left the window was minimal, or if the hash for the newly introduced kmer is minimal, giving $P_{m} = \frac{2}{w+1}$. For syncmers, the probability a large window is a syncmer is if either it's left or rightmost kmer have minimal hashes giving $P_{s} = \frac{2}{w}$. Viewing these as both being Bernoulli r.v.s then $E[x] = P_m$ and $E[x] = P_s$. Although the way I implemented Syncmers is a bit different from what is described in the paper because I don't break ties for Syncmers, I just check to make sure that the smallest hash value in the large window is also equal to either the hash value of the leftmost or rightmost kmer. But, since w is set to 16, and ntHash is a 64 bit hash, the probability of ties is basically zero.
-# Results -The graphs for the set of tests run on the ACTG only sequences are in the ACTG_Only_Graphs folder and the graphs for the set of tests run on sequences that contained N are in the Not_ACTG_Only_Graphs folder.
- -All graphs look normal and all 3 methods of obtaining minizmers are normal about the theoretical expected value.
- -I am not sure if the Central Limit Theorem can apply here. I think with how things are modeled, samples are drawn with replacement, but also universal hashes are only pairwise independent as opposed to completely independent, and furthermore, Window Minimizers and Syncmers most certainly are not independent as whether the current kmer is a minimizer or the smallest in the window is very much affected by what the previous minimizer was and what other values in the window are. However, I imagine they do satisfy the condition for being considered weakly dependent as if two kmers are significantly far apart, the first kmer will have no overlap with the second kmer and thus their hash values will tell you nothing about one another, additonally for Window Minimizers and Syncmers, if large windows are sufficiently far apart then know what the minimizer was for one large window tells you nothing about the other.
+I wanted to do some checks on the behavior of my program and so I decided to throw together some software to look at the density of minimizers, $\frac{number-of-minimizers}{number-of-total-kmers}$.
+# Set Up +For this series of experiments, the small window size was set to 16 for everything, and the sequence length was set to $10^5$. I wrote a program to randomly generate 100 sequences that only contained ACTG characters and another 100 sequences that could contain the character N, along with ACTG characters. So for the ACTG only sequences, there are 99985 kmers, and for the non-ACTG allowed sequences, the probability of a character being N was set to 1/33, and on average the number of kmers was between 60000 and 62000. The program I wrote does an actual count of how many k-mers are in the sequence following the same skipping rules that the Digester does.
+ +For the Mod Minimizer, I ran tests using 4 different mod values, M, 2 prime and 2 not prime. I thought of X as a Bernoulli(1/M) representing the probability that a given kmer in the sequence was a minimizer.
+ +For the [Window Minimizer](https://academic.oup.com/bioinformatics/article/33/14/i110/3953951) and [Syncmer](https://peerj.com/articles/10805/), apparently the expected density of minimizers is $\frac{2}{w+1}$ and $\frac{2}{w}$ respectively, where w is the number of kmers in the large window. The syncmers I implement are what the paper calls closed syncmers. The explanation for these densities, briefly and assuming hashes are i.i.d., is as follows. For window minimizers, when you slide the large window over 1 spot, you get a new minimizer if the hash of the kmer that just left the window was minimal, or if the hash of the newly introduced kmer is minimal, giving $P_{m} = \frac{2}{w+1}$. For syncmers, a large window is a syncmer if either its leftmost or rightmost kmer has the minimal hash, giving $P_{s} = \frac{2}{w}$. Viewing both of these as Bernoulli r.v.s, $E[x] = P_m$ and $E[x] = P_s$ respectively. The way I implemented Syncmers is a bit different from what is described in the paper, because I don't break ties for Syncmers; I just check that the smallest hash value in the large window is equal to the hash value of either the leftmost or rightmost kmer. But, since w is set to 16, and ntHash is a 64 bit hash, the probability of ties is basically zero.
+# Results +The graphs for the set of tests run on the ACTG only sequences are in the ACTG_Only_Graphs folder and the graphs for the set of tests run on sequences that contained N are in the Not_ACTG_Only_Graphs folder.
+ +All graphs look normal, and the densities from all 3 methods of obtaining minimizers are normally distributed about the theoretical expected value.
+ +I am not sure if the Central Limit Theorem can apply here. I think with how things are modeled, samples are drawn with replacement, but also universal hashes are only pairwise independent as opposed to completely independent, and furthermore, Window Minimizers and Syncmers most certainly are not independent, as whether the current kmer is a minimizer or the smallest in the window is very much affected by what the previous minimizer was and what other values in the window are. However, I imagine they do satisfy the condition for being considered weakly dependent: if two kmers are significantly far apart, the first kmer will have no overlap with the second kmer and thus their hash values will tell you nothing about one another. Additionally, for Window Minimizers and Syncmers, if large windows are sufficiently far apart, then knowing what the minimizer was for one large window tells you nothing about the other.
diff --git a/tests/density/non-ACTG.cpp b/tests/density/non-ACTG.cpp index 6cc0dcf..df255f7 100644 --- a/tests/density/non-ACTG.cpp +++ b/tests/density/non-ACTG.cpp @@ -1,158 +1,158 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "digest/mod_minimizer.hpp" -#include "digest/syncmer.hpp" -#include "digest/window_minimizer.hpp" - -typedef long long ll; -// if you end up using long double, you need to set the floating point notation -// to fixed, and set the percision to be very high -typedef long double ld; - -#define INF 2001001001 -#define INF2 2e18 -#define MOD 1000000007 - -#define max3(a, b, c) max(a, max(b, c)) -#define min3(a, b, c) min(a, min(b, c)) -#define pb push_back -#define pf push_front -#define f first -#define s second -#define mp make_pair -#define pll pair -#define pii pair -#define tp make_tuple - -// first four are north, west, east ,south -int dir1[] = {1, 0, -1, 0, 1, 1, -1, -1}; -int dir2[] = {0, 1, 0, -1, 1, -1, 1, -1}; - -int main() { - - std::cout << std::fixed << std::setprecision(8); - // if you use ld, use the above and don't use string stream - - std::string str; - - std::vector strs; - assert(freopen("../tests/density/non-ACTG.txt", "r", stdin)); - for (int i = 0; i < 100; i++) { - std::cin >> str; - strs.pb(str); - } - - std::vector> mod_min_vec(4, std::vector()); - std::vector> wind_min_vec(4, std::vector()); - std::vector> sync_vec(4, std::vector()); - - uint64_t mods[4] = {109, 128, 1009, 1024}; - unsigned l_winds[4] = {7, 8, 17, 16}; - - std::vector kmers(100, 0); - - for (int i = 0; i < 100; i++) { - int start = 0; - while (start + 7 < 1e5) { - bool works = true; - for (int j = 0; j < 16; j++) { - if (strs[i][start + j] == 'N') { - works = false; - start = start + j; - break; - } - } - if (works) { - kmers[i]++; - } 
- start++; - } - // std::cout << kmers[i] << " "; - } - // std::cout << std::endl; - - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 100; j++) { - digest::ModMin mm( - strs[j], 16, mods[i], 0, 0, digest::MinimizedHashType::CANON); - std::vector temp; - mm.roll_minimizer(100000, temp); - double am = temp.size(); - am /= kmers[i]; - mod_min_vec[i].pb(am); - } - } - - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 100; j++) { - digest::WindowMin - wm(strs[j], 16, l_winds[i], 0, digest::MinimizedHashType::CANON); - std::vector temp; - wm.roll_minimizer(100000, temp); - double am = temp.size(); - am /= kmers[i]; - - wind_min_vec[i].pb(am); - } - } - - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 100; j++) { - digest::Syncmer - syn(strs[j], 16, l_winds[i], 0, digest::MinimizedHashType::CANON); - std::vector temp; - syn.roll_minimizer(100000, temp); - double am = temp.size(); - am /= kmers[i]; - - sync_vec[i].pb(am); - } - } - assert(freopen("../tests/density/out2.txt", "w", stdout)); - for (int i = 0; i < 4; i++) { - for (size_t j = 0; j < 100; j++) { - std::cout << mod_min_vec[i][j] << " "; - } - std::cout << std::endl; - } - - for (int i = 0; i < 4; i++) { - for (size_t j = 0; j < 100; j++) { - std::cout << wind_min_vec[i][j] << " "; - } - std::cout << std::endl; - } - - for (int i = 0; i < 4; i++) { - for (size_t j = 0; j < 100; j++) { - std::cout << sync_vec[i][j] << " "; - } - std::cout << std::endl; - } - - return 0; +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "digest/mod_minimizer.hpp" +#include "digest/syncmer.hpp" +#include "digest/window_minimizer.hpp" + +typedef long long ll; +// if you end up using long double, you need to set the floating point notation +// to fixed, and set the percision to be very high +typedef long double 
ld; + +#define INF 2001001001 +#define INF2 2e18 +#define MOD 1000000007 + +#define max3(a, b, c) max(a, max(b, c)) +#define min3(a, b, c) min(a, min(b, c)) +#define pb push_back +#define pf push_front +#define f first +#define s second +#define mp make_pair +#define pll pair +#define pii pair +#define tp make_tuple + +// first four are north, west, east ,south +int dir1[] = {1, 0, -1, 0, 1, 1, -1, -1}; +int dir2[] = {0, 1, 0, -1, 1, -1, 1, -1}; + +int main() { + + std::cout << std::fixed << std::setprecision(8); + // if you use ld, use the above and don't use string stream + + std::string str; + + std::vector strs; + assert(freopen("../tests/density/non-ACTG.txt", "r", stdin)); + for (int i = 0; i < 100; i++) { + std::cin >> str; + strs.pb(str); + } + + std::vector> mod_min_vec(4, std::vector()); + std::vector> wind_min_vec(4, std::vector()); + std::vector> sync_vec(4, std::vector()); + + uint64_t mods[4] = {109, 128, 1009, 1024}; + unsigned l_winds[4] = {7, 8, 17, 16}; + + std::vector kmers(100, 0); + + for (int i = 0; i < 100; i++) { + int start = 0; + while (start + 7 < 1e5) { + bool works = true; + for (int j = 0; j < 16; j++) { + if (strs[i][start + j] == 'N') { + works = false; + start = start + j; + break; + } + } + if (works) { + kmers[i]++; + } + start++; + } + // std::cout << kmers[i] << " "; + } + // std::cout << std::endl; + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 100; j++) { + digest::ModMin mm( + strs[j], 16, mods[i], 0, 0, digest::MinimizedHashType::CANON); + std::vector temp; + mm.roll_minimizer(100000, temp); + double am = temp.size(); + am /= kmers[i]; + mod_min_vec[i].pb(am); + } + } + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 100; j++) { + digest::WindowMin + wm(strs[j], 16, l_winds[i], 0, digest::MinimizedHashType::CANON); + std::vector temp; + wm.roll_minimizer(100000, temp); + double am = temp.size(); + am /= kmers[i]; + + wind_min_vec[i].pb(am); + } + } + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 100; 
j++) { + digest::Syncmer + syn(strs[j], 16, l_winds[i], 0, digest::MinimizedHashType::CANON); + std::vector temp; + syn.roll_minimizer(100000, temp); + double am = temp.size(); + am /= kmers[i]; + + sync_vec[i].pb(am); + } + } + assert(freopen("../tests/density/out2.txt", "w", stdout)); + for (int i = 0; i < 4; i++) { + for (size_t j = 0; j < 100; j++) { + std::cout << mod_min_vec[i][j] << " "; + } + std::cout << std::endl; + } + + for (int i = 0; i < 4; i++) { + for (size_t j = 0; j < 100; j++) { + std::cout << wind_min_vec[i][j] << " "; + } + std::cout << std::endl; + } + + for (int i = 0; i < 4; i++) { + for (size_t j = 0; j < 100; j++) { + std::cout << sync_vec[i][j] << " "; + } + std::cout << std::endl; + } + + return 0; } \ No newline at end of file diff --git a/tests/density/plotting.py b/tests/density/plotting.py index 8da0e84..ae9209e 100644 --- a/tests/density/plotting.py +++ b/tests/density/plotting.py @@ -1,54 +1,54 @@ -import matplotlib.pyplot as plt -from matplotlib import colors -from matplotlib.ticker import PercentFormatter - -inputs = []; -for i in range(12): - temp = []; - inputs.append(temp); - -fp = open("out1.txt"); -for i in range(12): - line = fp.readline(); - inputs[i] = line.rstrip().split(); - -data = []; -for i in range(12): - temp = []; - data.append(temp); -for i in range(12): - for j in range(100): - data[i].append(float(inputs[i][j])); - -""" -for i in range(12): - for j in range(100): - - print(data[i][j], end = " ") - print(""); -""" -expected = [0.009174, 0.007812, 0.0009911, 0.0009766, 0.25, 0.2222, 0.1111, 0.1176, 0.2857, 0.25, 0.1176, 0.125] -titles = ["ModMin: Mod is 109", "ModMin: Mod is 128", "ModMin: Mod is 1009", "ModMin: Mod is 1024", "WindowMin: Large Window is 7", "WindowMin: Large Window is 8", "WindowMin: Large Window is 17", "WindowMin: Large Window is 16", "Syncmer: Large Window is 7", "Syncmer: Large Window is 8", "Syncmer: Large Window is 17", "Syncmer: Large Window is 16"] -for i in range(12): - fig, axs = 
plt.subplots(1, 1, figsize =(10, 7), tight_layout = True) - - axs.xaxis.set_ticks_position('none') - axs.yaxis.set_ticks_position('none') - - axs.xaxis.set_tick_params(pad = 5) - axs.yaxis.set_tick_params(pad = 10) - - N, bins, patches = axs.hist(data[i], bins = 10) - plt.axvline(expected[i], color='k', linestyle='dashed', linewidth=1) - fracs = ((N**(1 / 5)) / N.max()) - norm = colors.Normalize(fracs.min(), fracs.max()) - - for thisfrac, thispatch in zip(fracs, patches): - color = plt.cm.viridis(norm(thisfrac)) - thispatch.set_facecolor(color) - plt.xlabel("X-bar") - plt.ylabel("Count") - legend_str = " ,E[X] = " + str(expected[i]) - plt.title(titles[i] + legend_str) - plt.show() - +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.ticker import PercentFormatter + +inputs = []; +for i in range(12): + temp = []; + inputs.append(temp); + +fp = open("out1.txt"); +for i in range(12): + line = fp.readline(); + inputs[i] = line.rstrip().split(); + +data = []; +for i in range(12): + temp = []; + data.append(temp); +for i in range(12): + for j in range(100): + data[i].append(float(inputs[i][j])); + +""" +for i in range(12): + for j in range(100): + + print(data[i][j], end = " ") + print(""); +""" +expected = [0.009174, 0.007812, 0.0009911, 0.0009766, 0.25, 0.2222, 0.1111, 0.1176, 0.2857, 0.25, 0.1176, 0.125] +titles = ["ModMin: Mod is 109", "ModMin: Mod is 128", "ModMin: Mod is 1009", "ModMin: Mod is 1024", "WindowMin: Large Window is 7", "WindowMin: Large Window is 8", "WindowMin: Large Window is 17", "WindowMin: Large Window is 16", "Syncmer: Large Window is 7", "Syncmer: Large Window is 8", "Syncmer: Large Window is 17", "Syncmer: Large Window is 16"] +for i in range(12): + fig, axs = plt.subplots(1, 1, figsize =(10, 7), tight_layout = True) + + axs.xaxis.set_ticks_position('none') + axs.yaxis.set_ticks_position('none') + + axs.xaxis.set_tick_params(pad = 5) + axs.yaxis.set_tick_params(pad = 10) + + N, bins, patches = axs.hist(data[i], 
bins = 10) + plt.axvline(expected[i], color='k', linestyle='dashed', linewidth=1) + fracs = ((N**(1 / 5)) / N.max()) + norm = colors.Normalize(fracs.min(), fracs.max()) + + for thisfrac, thispatch in zip(fracs, patches): + color = plt.cm.viridis(norm(thisfrac)) + thispatch.set_facecolor(color) + plt.xlabel("X-bar") + plt.ylabel("Count") + legend_str = " ,E[X] = " + str(expected[i]) + plt.title(titles[i] + legend_str) + plt.show() + diff --git a/tests/density/str_gen.cpp b/tests/density/str_gen.cpp index 4b970a3..3606e64 100644 --- a/tests/density/str_gen.cpp +++ b/tests/density/str_gen.cpp @@ -1,107 +1,107 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -using namespace std; -typedef long long ll; -// if you end up using long double, you need to set the floating point notation -// to fixed, and set the percision to be very high -typedef long double ld; - -// contrsuct umaps like this, unordered_map -// safe_map; FIXED_RANDOM is static so it doesn not get redeclared between -// function calls -struct custom_hash { - static uint64_t splitmix64(uint64_t x) { - // http://xorshift.di.unimi.it/splitmix64.c - x += 0x9e3779b97f4a7c15; - x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9; - x = (x ^ (x >> 27)) * 0x94d049bb133111eb; - return x ^ (x >> 31); - } - - size_t operator()(uint64_t x) const { - - static const uint64_t FIXED_RANDOM = - chrono::steady_clock::now().time_since_epoch().count(); - return splitmix64(x + FIXED_RANDOM); - } -}; - -#define INF 2001001001 -#define INF2 2e18 -#define MOD 1000000007 - -#define f0r(a, b) for (long long a = 0; a < b; a++) -#define f1r(a, b, c) for (long long a = b; a < c; a++) -#define max3(a, b, c) max(a, max(b, c)) -#define min3(a, b, c) min(a, min(b, c)) -#define pb push_back -#define pf push_front -#define f first -#define s second 
-#define mp make_pair -#define pll pair -#define pii pair -#define tp make_tuple - -// first four are north, west, east ,south -int dir1[] = {1, 0, -1, 0, 1, 1, -1, -1}; -int dir2[] = {0, 1, 0, -1, 1, -1, 1, -1}; - -int main() { - - // use this if you read in from a file - - // freopen("in.txt", "r", stdin); - freopen("ACTG.txt", "w", stdout); - - char chars[4] = {'A', 'C', 'T', 'G'}; - for (int i = 0; i < 100; i++) { - string temp; - for (int j = 0; j < 1e5; j++) { - int curr = rand() % 4; - temp.pb(chars[curr]); - } - cout << temp << endl; - cout << endl; - } - freopen("non-ACTG.txt", "w", stdout); - char chars2[5] = {'A', 'C', 'T', 'G', 'N'}; - for (int i = 0; i < 100; i++) { - string temp; - for (int j = 0; j < 1e5; j++) { - int curr = rand() % 33; - if (curr == 32) { - temp.pb(chars2[4]); - } else { - curr %= 4; - temp.pb(chars2[curr]); - } - } - cout << temp << endl; - cout << endl; - } - return 0; +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +using namespace std; +typedef long long ll; +// if you end up using long double, you need to set the floating point notation +// to fixed, and set the percision to be very high +typedef long double ld; + +// contrsuct umaps like this, unordered_map +// safe_map; FIXED_RANDOM is static so it doesn not get redeclared between +// function calls +struct custom_hash { + static uint64_t splitmix64(uint64_t x) { + // http://xorshift.di.unimi.it/splitmix64.c + x += 0x9e3779b97f4a7c15; + x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9; + x = (x ^ (x >> 27)) * 0x94d049bb133111eb; + return x ^ (x >> 31); + } + + size_t operator()(uint64_t x) const { + + static const uint64_t FIXED_RANDOM = + chrono::steady_clock::now().time_since_epoch().count(); + return splitmix64(x + FIXED_RANDOM); + } +}; + +#define INF 2001001001 +#define INF2 2e18 
+#define MOD 1000000007 + +#define f0r(a, b) for (long long a = 0; a < b; a++) +#define f1r(a, b, c) for (long long a = b; a < c; a++) +#define max3(a, b, c) max(a, max(b, c)) +#define min3(a, b, c) min(a, min(b, c)) +#define pb push_back +#define pf push_front +#define f first +#define s second +#define mp make_pair +#define pll pair +#define pii pair +#define tp make_tuple + +// first four are north, west, east ,south +int dir1[] = {1, 0, -1, 0, 1, 1, -1, -1}; +int dir2[] = {0, 1, 0, -1, 1, -1, 1, -1}; + +int main() { + + // use this if you read in from a file + + // freopen("in.txt", "r", stdin); + freopen("ACTG.txt", "w", stdout); + + char chars[4] = {'A', 'C', 'T', 'G'}; + for (int i = 0; i < 100; i++) { + string temp; + for (int j = 0; j < 1e5; j++) { + int curr = rand() % 4; + temp.pb(chars[curr]); + } + cout << temp << endl; + cout << endl; + } + freopen("non-ACTG.txt", "w", stdout); + char chars2[5] = {'A', 'C', 'T', 'G', 'N'}; + for (int i = 0; i < 100; i++) { + string temp; + for (int j = 0; j < 1e5; j++) { + int curr = rand() % 33; + if (curr == 32) { + temp.pb(chars2[4]); + } else { + curr %= 4; + temp.pb(chars2[curr]); + } + } + cout << temp << endl; + cout << endl; + } + return 0; } \ No newline at end of file diff --git a/tests/test/rep_n_w_a.cpp b/tests/test/rep_n_w_a.cpp index 1b21945..6dad339 100644 --- a/tests/test/rep_n_w_a.cpp +++ b/tests/test/rep_n_w_a.cpp @@ -1,81 +1,81 @@ -#include -using namespace std; -typedef long long ll; -// if you end up using long double, you need to set the floating point notation -// to fixed, and set the percision to be very high -typedef long double ld; - -// contrsuct umaps like this, unordered_map -// safe_map; FIXED_RANDOM is static so it doesn not get redeclared between -// function calls -struct custom_hash { - static uint64_t splitmix64(uint64_t x) { - // http://xorshift.di.unimi.it/splitmix64.c - x += 0x9e3779b97f4a7c15; - x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9; - x = (x ^ (x >> 27)) * 
0x94d049bb133111eb; - return x ^ (x >> 31); - } - - size_t operator()(uint64_t x) const { - - static const uint64_t FIXED_RANDOM = - chrono::steady_clock::now().time_since_epoch().count(); - return splitmix64(x + FIXED_RANDOM); - } -}; - -#define INF 2001001001 -#define INF2 2e18 -#define MOD 1000000007 - -#define f0r(a, b) for (long long a = 0; a < b; a++) -#define f1r(a, b, c) for (long long a = b; a < c; a++) -#define max3(a, b, c) max(a, max(b, c)) -#define min3(a, b, c) min(a, min(b, c)) -#define pb push_back -#define pf push_front -#define f first -#define s second -#define mp make_pair -#define pll pair -#define pii pair -#define tp make_tuple - -// first four are north, west, east ,south -int dir1[] = {1, 0, -1, 0, 1, 1, -1, -1}; -int dir2[] = {0, 1, 0, -1, 1, -1, 1, -1}; - -int main() { - // apparently this does fast i/o - cin.tie(0), ios::sync_with_stdio(0); - - // use this if you read in from a file - - freopen("random.txt", "r", stdin); - freopen("random_N_to_A.txt", "w", stdout); - - stringstream ss; - - // Do it once. Do it right. - // Read the problem statement carefully - // Plan out the steps in words on a piece of paper before implementing - // after RTE(obviously) but also WA, run valgrind!!! 
- - // cout << fixed << setprecision(12); - // if you use ld, use the above and don't use string stream - - // use instead of ceil(a, b) if a and b are positive - // (a + b - 1) / b - string str; - cin >> str; - for (int i = 0; i < str.size(); i++) { - if (str[i] == 'N') { - str[i] = 'A'; - } - } - cout << str; - - cout << ss.str(); - return 0; -} +#include +using namespace std; +typedef long long ll; +// if you end up using long double, you need to set the floating point notation +// to fixed, and set the percision to be very high +typedef long double ld; + +// contrsuct umaps like this, unordered_map +// safe_map; FIXED_RANDOM is static so it doesn not get redeclared between +// function calls +struct custom_hash { + static uint64_t splitmix64(uint64_t x) { + // http://xorshift.di.unimi.it/splitmix64.c + x += 0x9e3779b97f4a7c15; + x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9; + x = (x ^ (x >> 27)) * 0x94d049bb133111eb; + return x ^ (x >> 31); + } + + size_t operator()(uint64_t x) const { + + static const uint64_t FIXED_RANDOM = + chrono::steady_clock::now().time_since_epoch().count(); + return splitmix64(x + FIXED_RANDOM); + } +}; + +#define INF 2001001001 +#define INF2 2e18 +#define MOD 1000000007 + +#define f0r(a, b) for (long long a = 0; a < b; a++) +#define f1r(a, b, c) for (long long a = b; a < c; a++) +#define max3(a, b, c) max(a, max(b, c)) +#define min3(a, b, c) min(a, min(b, c)) +#define pb push_back +#define pf push_front +#define f first +#define s second +#define mp make_pair +#define pll pair +#define pii pair +#define tp make_tuple + +// first four are north, west, east ,south +int dir1[] = {1, 0, -1, 0, 1, 1, -1, -1}; +int dir2[] = {0, 1, 0, -1, 1, -1, 1, -1}; + +int main() { + // apparently this does fast i/o + cin.tie(0), ios::sync_with_stdio(0); + + // use this if you read in from a file + + freopen("random.txt", "r", stdin); + freopen("random_N_to_A.txt", "w", stdout); + + stringstream ss; + + // Do it once. Do it right. 
+ // Read the problem statement carefully + // Plan out the steps in words on a piece of paper before implementing + // after RTE(obviously) but also WA, run valgrind!!! + + // cout << fixed << setprecision(12); + // if you use ld, use the above and don't use string stream + + // use instead of ceil(a, b) if a and b are positive + // (a + b - 1) / b + string str; + cin >> str; + for (int i = 0; i < str.size(); i++) { + if (str[i] == 'N') { + str[i] = 'A'; + } + } + cout << str; + + cout << ss.str(); + return 0; +} diff --git a/tests/test/test.cpp b/tests/test/test.cpp index 15bd800..cc1c72b 100644 --- a/tests/test/test.cpp +++ b/tests/test/test.cpp @@ -1,1555 +1,1566 @@ -#include "digest/data_structure.hpp" -#include "digest/mod_minimizer.hpp" -#include "digest/syncmer.hpp" -#include "digest/window_minimizer.hpp" -#include -#include -#include -#include -#include -#include - -std::vector test_strs; -// used to be first value was 1, but now k must be >= 4 -unsigned ks[] = {4, 4, 7, 8, 9, 16, 25, 64}; - -void setupStrings() { - std::string files[] = { - "../tests/test/A.txt", - "../tests/test/a_lowercase.txt", - "../tests/test/salmonella_enterica.txt", - "../tests/test/salmonella_lowercase.txt", - "../tests/test/random.txt", - "../tests/test/random_lowercase.txt", - "../tests/test/N.txt", - "../tests/test/random_N_to_A.txt", - }; - - for (auto &file : files) { - std::ifstream ifs(file); - ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); - std::string str; - ifs >> str; - test_strs.push_back(str); - } -} - -template -void base_constructor(digest::Digester

&dig, std::string &str, unsigned k, - size_t pos, digest::MinimizedHashType minimized_h) { - INFO("String is: " << str); - INFO("K is: " << k); - INFO("Pos is: " << dig.get_pos()); - - CHECK(strcmp(str.c_str(), dig.get_sequence()) == 0); - CHECK(str.size() == dig.get_len()); - CHECK(dig.get_k() == k); - CHECK(dig.get_minimized_h() == minimized_h); - if (k <= str.size()) { - nthash::NtHash tHash(str, 1, k, pos); - CHECK(dig.get_is_valid_hash() == tHash.roll()); - if (dig.get_is_valid_hash()) { - CHECK(dig.get_pos() == tHash.get_pos()); - INFO("ntHash pos is: " << tHash.get_pos()); - CHECK(dig.get_fhash() == tHash.get_forward_hash()); - CHECK(dig.get_rhash() == tHash.get_reverse_hash()); - } - } else { - CHECK(dig.get_is_valid_hash() == false); - } -} - -template -void base_constructor_writeover(digest::Digester

&dig, std::string &str, - unsigned k, size_t pos, - digest::MinimizedHashType minimized_h) { - INFO("String is: " << str); - INFO("K is: " << k); - INFO("Pos is: " << dig.get_pos()); - - CHECK(strcmp(str.c_str(), dig.get_sequence()) == 0); - CHECK(str.size() == dig.get_len()); - CHECK(dig.get_k() == k); - CHECK(dig.get_minimized_h() == minimized_h); - if (k <= str.size()) { - nthash::NtHash tHash(test_strs[7], 1, k, pos); - CHECK(dig.get_is_valid_hash() == tHash.roll()); - if (dig.get_is_valid_hash()) { - CHECK(dig.get_pos() == tHash.get_pos()); - INFO("ntHash pos is: " << tHash.get_pos()); - CHECK(dig.get_fhash() == tHash.get_forward_hash()); - CHECK(dig.get_rhash() == tHash.get_reverse_hash()); - } - } else { - CHECK(dig.get_is_valid_hash() == false); - } -} - -template -void base_dig_comp(digest::Digester

&dig1, digest::Digester

&dig2) { - CHECK(strcmp(dig1.get_sequence(), dig2.get_sequence()) == 0); - CHECK(dig1.get_len() == dig2.get_len()); - CHECK(dig1.get_k() == dig2.get_k()); - CHECK(dig1.get_minimized_h() == dig2.get_minimized_h()); - CHECK(dig1.get_is_valid_hash() == dig2.get_is_valid_hash()); - if (dig1.get_is_valid_hash()) { - CHECK(dig1.get_chash() == dig2.get_chash()); - CHECK(dig1.get_fhash() == dig2.get_fhash()); - CHECK(dig1.get_rhash() == dig2.get_rhash()); - } -} - -template -void base_dig_roll(digest::Digester

&dig1, digest::Digester

&dig2) { - while (dig1.get_is_valid_hash()) { - dig1.roll_one(); - dig2.roll_one(); - CHECK(dig1.get_fhash() == dig2.get_fhash()); - CHECK(dig1.get_rhash() == dig2.get_rhash()); - CHECK(dig1.get_pos() == dig2.get_pos()); - } - CHECK(dig1.get_is_valid_hash() == dig2.get_is_valid_hash()); -} - -template -void ModMin_constructor(digest::ModMin

&dig, std::string &str, unsigned k, - size_t pos, digest::MinimizedHashType minimized_h, - uint64_t mod, uint64_t congruence) { - base_constructor(dig, str, k, pos, minimized_h); - CHECK(dig.get_mod() == mod); - CHECK(dig.get_congruence() == congruence); -} - -template -void ModMin_constructor_writeover(digest::ModMin

&dig, std::string &str, - unsigned k, size_t pos, - digest::MinimizedHashType minimized_h, - uint64_t mod, uint64_t congruence) { - base_constructor_writeover(dig, str, k, pos, minimized_h); - CHECK(dig.get_mod() == mod); - CHECK(dig.get_congruence() == congruence); -} - -template -void WindowMin_constructor(digest::WindowMin &dig, std::string &str, - unsigned k, unsigned large_wind_kmer_am, size_t pos, - digest::MinimizedHashType minimized_h) { - base_constructor(dig, str, k, pos, minimized_h); - CHECK(dig.get_large_wind_kmer_am() == large_wind_kmer_am); - // CHECK(dig.get_st_index() == 0); - CHECK(dig.get_ds_size() == 0); - CHECK(dig.get_is_minimized() == false); -} - -template -void ModMin_dig_comp(digest::ModMin

&dig1, digest::ModMin

&dig2) { - base_dig_comp(dig1, dig2); - CHECK(dig1.get_mod() == dig2.get_mod()); - CHECK(dig1.get_congruence() == dig2.get_congruence()); - base_dig_roll(dig1, dig2); -} - -template -void WindowMin_roll_minimizers_comp(digest::WindowMin &dig1, - digest::WindowMin &dig2) { - std::vector vec1; - std::vector vec2; - dig1.roll_minimizer(1000, vec1); - dig2.roll_minimizer(1000, vec2); - REQUIRE(vec1.size() == vec2.size()); - for (size_t i = 0; i < vec1.size(); i++) { - CHECK(vec1[i] == vec2[i]); - } -} - -template -void Syncmer_roll_minimizers_comp(digest::Syncmer &dig1, - digest::Syncmer &dig2) { - std::vector vec1; - std::vector vec2; - dig1.roll_minimizer(1000, vec1); - dig2.roll_minimizer(1000, vec2); - REQUIRE(vec1.size() == vec2.size()); - for (size_t i = 0; i < vec1.size(); i++) { - CHECK(vec1[i] == vec2[i]); - } -} - -template -void WindowMin_dig_comp(digest::WindowMin &dig1, - digest::WindowMin &dig2) { - base_dig_comp(dig1, dig2); - CHECK(dig1.get_large_wind_kmer_am() == dig2.get_large_wind_kmer_am()); - CHECK(dig1.get_ds_size() == dig2.get_ds_size()); - CHECK(dig1.get_is_minimized() == dig2.get_is_minimized()); - // need to use this because I need to check, or at least get some indication, - // of whether the two seg trees are the same - WindowMin_roll_minimizers_comp(dig1, dig2); -} - -template -void Syncmer_dig_comp(digest::Syncmer &dig1, - digest::Syncmer &dig2) { - base_dig_comp(dig1, dig2); - CHECK(dig1.get_large_wind_kmer_am() == dig2.get_large_wind_kmer_am()); - CHECK(dig1.get_ds_size() == dig2.get_ds_size()); - CHECK(dig1.get_is_minimized() == dig2.get_is_minimized()); - // need to use this because I need to check, or at least get some indication, - // of whether the two seg trees are the same - Syncmer_roll_minimizers_comp(dig1, dig2); -} - -template -void roll_one(digest::Digester

&dig, std::string &str, unsigned k) { - INFO(str); - INFO(k); - nthash::NtHash tHash(str, 1, k, 0); - uint64_t true_fhash; - uint64_t true_rhash; - uint64_t dig_fhash; - uint64_t dig_rhash; - bool worked = tHash.roll(); - while ((worked = tHash.roll())) { - dig.roll_one(); - CHECK(dig.get_is_valid_hash() == worked); - if (worked) { - CHECK(dig.get_pos() == tHash.get_pos()); - true_fhash = tHash.get_forward_hash(); - true_rhash = tHash.get_reverse_hash(); - dig_fhash = dig.get_fhash(); - dig_rhash = dig.get_rhash(); - CHECK(dig_fhash == true_fhash); - CHECK(dig_rhash == true_rhash); - } - } - dig.roll_one(); - CHECK(dig.get_is_valid_hash() == worked); -} - -template -void roll_one_write_over(digest::Digester

&dig, std::string &str, - unsigned k) { - INFO(str); - INFO(k); - nthash::NtHash tHash(test_strs[7], 1, k, 0); - uint64_t true_fhash; - uint64_t true_rhash; - uint64_t dig_fhash; - uint64_t dig_rhash; - bool worked = tHash.roll(); - while ((worked = tHash.roll())) { - dig.roll_one(); - CHECK(dig.get_is_valid_hash() == worked); - if (worked) { - CHECK(dig.get_pos() == tHash.get_pos()); - true_fhash = tHash.get_forward_hash(); - true_rhash = tHash.get_reverse_hash(); - dig_fhash = dig.get_fhash(); - dig_rhash = dig.get_rhash(); - CHECK(dig_fhash == true_fhash); - CHECK(dig_rhash == true_rhash); - } - } - dig.roll_one(); - CHECK(dig.get_is_valid_hash() == worked); -} - -template -void ModMin_roll_minimizer(digest::ModMin

&dig, std::string &str, unsigned k, - digest::MinimizedHashType minimized_h, - uint32_t prime) { - nthash::NtHash tHash(str, 1, k, 0); - std::vector positions; - std::vector hashes; - while (tHash.roll()) { - uint32_t temp; - if (minimized_h == digest::MinimizedHashType::CANON) { - temp = *(tHash.hashes()); - } else if (minimized_h == digest::MinimizedHashType::FORWARD) { - temp = tHash.get_forward_hash(); - } else { - temp = tHash.get_reverse_hash(); - } - if (temp % prime == 0) { - positions.push_back(tHash.get_pos()); - hashes.push_back(temp); - } - } - digest::ModMin dig2 = dig; - - std::vector dig_positions; - dig.roll_minimizer(400, dig_positions); - REQUIRE(positions.size() == dig_positions.size()); - for (size_t i = 0; i < positions.size(); i++) { - CHECK(dig_positions[i] == positions[i]); - } - - std::vector> dig_positions2; - dig2.roll_minimizer(400, dig_positions2); - REQUIRE(positions.size() == dig_positions2.size()); - for (size_t i = 0; i < positions.size(); i++) { - CHECK(dig_positions2[i].first == positions[i]); - CHECK(dig_positions2[i].second == hashes[i]); - } -} - -template -void WindowMin_roll_minimizer(digest::WindowMin &dig, std::string &str, - unsigned k, unsigned large_wind_kmer_am, - digest::MinimizedHashType minimized_h) { - nthash::NtHash tHash(str, 1, k, 0); - std::vector> hashes; - while (tHash.roll()) { - uint32_t temp; - if (minimized_h == digest::MinimizedHashType::CANON) { - temp = *(tHash.hashes()); - } else if (minimized_h == digest::MinimizedHashType::FORWARD) { - temp = tHash.get_forward_hash(); - } else { - temp = tHash.get_reverse_hash(); - } - hashes.push_back(std::make_pair(temp, tHash.get_pos())); - } - - std::vector> answers; - std::pair prev; - for (size_t i = 0; i + large_wind_kmer_am <= hashes.size(); i++) { - std::pair temp_pair = hashes[i]; - for (uint j = 1; j < large_wind_kmer_am; j++) { - std::pair curr = hashes[i + j]; - if (curr.first < temp_pair.first) { - temp_pair = curr; - } else if (curr.first == 
temp_pair.first) { - if (curr.second > temp_pair.second) { - temp_pair = curr; - } - } - } - if (i == 0) { - prev = temp_pair; - answers.push_back(temp_pair); - } else { - if (prev != temp_pair) { - prev = temp_pair; - answers.push_back(temp_pair); - } - } - } - - digest::WindowMin dig2 = dig; - - std::vector wind_mins; - dig.roll_minimizer(1000, wind_mins); - REQUIRE(answers.size() == wind_mins.size()); - for (size_t i = 0; i < answers.size(); i++) { - CHECK(wind_mins[i] == answers[i].second); - } - - std::vector> wind_mins2; - dig2.roll_minimizer(1000, wind_mins2); - REQUIRE(answers.size() == wind_mins2.size()); - for (size_t i = 0; i < answers.size(); i++) { - CHECK(wind_mins2[i].second == answers[i].first); - CHECK(wind_mins2[i].first == answers[i].second); - } -} - -template -void Syncmer_roll_minimizer(digest::Syncmer &dig, std::string &str, - unsigned k, unsigned large_wind_kmer_am, - digest::MinimizedHashType minimized_h) { - nthash::NtHash tHash(str, 1, k, 0); - std::vector> hashes; - while (tHash.roll()) { - uint32_t temp; - if (minimized_h == digest::MinimizedHashType::CANON) { - temp = *(tHash.hashes()); - } else if (minimized_h == digest::MinimizedHashType::FORWARD) { - temp = tHash.get_forward_hash(); - } else { - temp = tHash.get_reverse_hash(); - } - hashes.push_back(std::make_pair(temp, tHash.get_pos())); - } - - std::vector> answers; - for (size_t i = 0; i + large_wind_kmer_am <= hashes.size(); i++) { - uint32_t minAm = hashes[i].first; - - for (uint j = 1; j < large_wind_kmer_am; j++) { - minAm = std::min(minAm, hashes[i + j].first); - } - - if (minAm == hashes[i].first || - minAm == hashes[i + large_wind_kmer_am - 1].first) { - answers.emplace_back(hashes[i].second, minAm); - } - } - - digest::Syncmer dig2 = dig; - - std::vector syncs; - dig.roll_minimizer(1000, syncs); - - assert(answers.size() == syncs.size()); - REQUIRE(answers.size() == syncs.size()); - for (size_t i = 0; i < answers.size(); i++) { - CHECK(syncs[i] == answers[i].first); - } 
- - std::vector> syncs2; - dig2.roll_minimizer(1000, syncs2); - REQUIRE(answers.size() == syncs2.size()); - for (size_t i = 0; i < answers.size(); i++) { - CHECK(syncs2[i].first == answers[i].first); - CHECK(syncs2[i].second == answers[i].second); - } -} - -template -void append_seq_compare(std::string &str1, std::string &str2, - digest::Digester

&dig, unsigned k) { - INFO(str1); - INFO(str2); - INFO(str1.size()); - INFO(str2.size()); - INFO(k); - - std::string str3 = str1 + str2; - nthash::NtHash tHash(str3, 1, k); - std::vector vec1; - std::vector positions1; - while (tHash.roll()) { - vec1.push_back(*(tHash.hashes())); - positions1.push_back(tHash.get_pos()); - } - std::vector vec2; - std::vector positions2; - if (dig.get_is_valid_hash()) { - vec2.push_back(dig.get_chash()); - positions2.push_back(dig.get_pos()); - while (dig.roll_one()) { - vec2.push_back(dig.get_chash()); - positions2.push_back(dig.get_pos()); - } - } - dig.append_seq(str2); - if (dig.get_is_valid_hash()) { - vec2.push_back(dig.get_chash()); - positions2.push_back(dig.get_pos()); - while (dig.roll_one()) { - vec2.push_back(dig.get_chash()); - positions2.push_back(dig.get_pos()); - } - } - REQUIRE(vec1.size() == vec2.size()); - for (size_t i = 0; i < vec1.size(); i++) { - INFO(i); - CHECK(vec1[i] == vec2[i]); - CHECK(positions1[i] == positions2[i]); - } -} - -template -void append_seq_compare3(std::string &str1, std::string &str2, std::string str3, - digest::Digester

&dig, unsigned k) { - INFO(str1); - INFO(str2); - INFO(str3); - INFO(k); - // Make sure to check positions too - std::string str4 = str1 + str2 + str3; - nthash::NtHash tHash(str4, 1, k); - std::vector vec1; - std::vector positions1; - while (tHash.roll()) { - vec1.push_back(*(tHash.hashes())); - positions1.push_back(tHash.get_pos()); - } - std::vector vec2; - std::vector positions2; - if (dig.get_is_valid_hash()) { - vec2.push_back(dig.get_chash()); - positions2.push_back(dig.get_pos()); - while (dig.roll_one()) { - vec2.push_back(dig.get_chash()); - positions2.push_back(dig.get_pos()); - } - } - dig.append_seq(str2); - if (dig.get_is_valid_hash()) { - vec2.push_back(dig.get_chash()); - positions2.push_back(dig.get_pos()); - while (dig.roll_one()) { - vec2.push_back(dig.get_chash()); - positions2.push_back(dig.get_pos()); - } - } - dig.append_seq(str3); - if (dig.get_is_valid_hash()) { - vec2.push_back(dig.get_chash()); - positions2.push_back(dig.get_pos()); - while (dig.roll_one()) { - vec2.push_back(dig.get_chash()); - positions2.push_back(dig.get_pos()); - } - } - REQUIRE(vec1.size() == vec2.size()); - for (size_t i = 0; i < vec1.size(); i++) { - INFO(i); - CHECK(vec1[i] == vec2[i]); - CHECK(positions1[i] == positions2[i]); - } -} - -void append_seq_small_cases() { - std::string str1 = "CCGTGT"; - std::string str2 = "CCGNGT"; - std::string str3 = "AGCCTT"; - std::string str4 = "ANCCTT"; - std::string str5 = "A"; - - digest::Digester *dig = - new digest::ModMin( - str1, 4, 17, 0, 0, digest::MinimizedHashType::CANON); - append_seq_compare(str1, str3, *dig, 4); - delete dig; - - dig = new digest::ModMin( - str2, 4, 17, 0, 0, digest::MinimizedHashType::CANON); - append_seq_compare(str2, str4, *dig, 4); - delete dig; - - dig = new digest::ModMin( - str2, 4, 17, 0, 0, digest::MinimizedHashType::CANON); - append_seq_compare(str2, str3, *dig, 4); - delete dig; - - dig = new digest::ModMin( - str2, 4, 17, 0, 0, digest::MinimizedHashType::CANON); - 
append_seq_compare(str2, str5, *dig, 4); - delete dig; - - dig = new digest::ModMin( - str1, 4, 17, 0, 0, digest::MinimizedHashType::CANON); - append_seq_compare(str1, str5, *dig, 4); - delete dig; -} - -void append_seq_small_cases2() { - std::string str1_good = "CATACCGGT"; - std::string str1_short = "TAG"; - std::string str1_badCh = "CATACNCGGT"; - - std::string str2_good = "GTTCTCGCTT"; - std::string str2_badCh = "GTNTCTCGCTT"; - std::string str2A = "A"; - std::string str2_short = "TGGA"; - - std::string str3_good = "CAACGACCGC"; - std::string str3_badCh = "NCAACGACCGC"; - - digest::Digester *dig = - new digest::ModMin( - str1_good, 6, 17, 0, 0, digest::MinimizedHashType::CANON); - append_seq_compare3(str1_good, str2_good, str3_good, *dig, 6); - delete dig; - - dig = new digest::ModMin( - str1_good, 6, 17, 0, 0, digest::MinimizedHashType::CANON); - append_seq_compare3(str1_good, str2_badCh, str3_good, *dig, 6); - delete dig; - - dig = new digest::ModMin( - str1_good, 6, 17, 0, 0, digest::MinimizedHashType::CANON); - append_seq_compare3(str1_good, str2A, str3_good, *dig, 6); - delete dig; - - dig = new digest::ModMin( - str1_short, 6, 17, 0, 0, digest::MinimizedHashType::CANON); - append_seq_compare3(str1_short, str2A, str3_good, *dig, 6); - delete dig; - - dig = new digest::ModMin( - str1_badCh, 6, 17, 0, 0, digest::MinimizedHashType::CANON); - append_seq_compare3(str1_badCh, str2A, str3_good, *dig, 6); - delete dig; - - dig = new digest::ModMin( - str1_good, 6, 17, 0, 0, digest::MinimizedHashType::CANON); - append_seq_compare3(str1_good, str2_short, str3_good, *dig, 6); - delete dig; - - dig = new digest::ModMin( - str1_short, 6, 17, 0, 0, digest::MinimizedHashType::CANON); - append_seq_compare3(str1_short, str2A, str3_badCh, *dig, 6); - delete dig; -} - -template -void append_seq_compare_write_over(std::string &str1, std::string &str2, - digest::Digester

&dig, unsigned k) { - INFO(str1); - INFO(str2); - INFO(str1.size()); - INFO(str2.size()); - INFO(k); - - std::string str3 = str1 + str2; - for (int i = 0; i < (int)str3.size(); i++) { - if (str3[i] == 'N' || str3[i] == 'n') { - str3[i] = 'A'; - } - } - nthash::NtHash tHash(str3, 1, k); - std::vector vec1; - std::vector positions1; - while (tHash.roll()) { - vec1.push_back(*(tHash.hashes())); - positions1.push_back(tHash.get_pos()); - } - std::vector vec2; - std::vector positions2; - if (dig.get_is_valid_hash()) { - vec2.push_back(dig.get_chash()); - positions2.push_back(dig.get_pos()); - while (dig.roll_one()) { - vec2.push_back(dig.get_chash()); - positions2.push_back(dig.get_pos()); - } - } - dig.append_seq(str2); - if (dig.get_is_valid_hash()) { - vec2.push_back(dig.get_chash()); - positions2.push_back(dig.get_pos()); - while (dig.roll_one()) { - vec2.push_back(dig.get_chash()); - positions2.push_back(dig.get_pos()); - } - } - REQUIRE(vec1.size() == vec2.size()); - for (size_t i = 0; i < vec1.size(); i++) { - INFO(i); - CHECK(vec1[i] == vec2[i]); - CHECK(positions1[i] == positions2[i]); - } -} - -template -void append_seq_compare3_write_over(std::string &str1, std::string &str2, - std::string str3, digest::Digester

&dig, - unsigned k) { - INFO(str1); - INFO(str2); - INFO(str3); - INFO(k); - // Make sure to check positions too - std::string str4 = str1 + str2 + str3; - for (int i = 0; i < (int)str4.size(); i++) { - if (str4[i] == 'N' || str4[i] == 'n') { - str4[i] = 'A'; - } - } - nthash::NtHash tHash(str4, 1, k); - std::vector vec1; - std::vector positions1; - while (tHash.roll()) { - vec1.push_back(*(tHash.hashes())); - positions1.push_back(tHash.get_pos()); - } - std::vector vec2; - std::vector positions2; - if (dig.get_is_valid_hash()) { - vec2.push_back(dig.get_chash()); - positions2.push_back(dig.get_pos()); - while (dig.roll_one()) { - vec2.push_back(dig.get_chash()); - positions2.push_back(dig.get_pos()); - } - } - dig.append_seq(str2); - if (dig.get_is_valid_hash()) { - vec2.push_back(dig.get_chash()); - positions2.push_back(dig.get_pos()); - while (dig.roll_one()) { - vec2.push_back(dig.get_chash()); - positions2.push_back(dig.get_pos()); - } - } - dig.append_seq(str3); - if (dig.get_is_valid_hash()) { - vec2.push_back(dig.get_chash()); - positions2.push_back(dig.get_pos()); - while (dig.roll_one()) { - vec2.push_back(dig.get_chash()); - positions2.push_back(dig.get_pos()); - } - } - REQUIRE(vec1.size() == vec2.size()); - for (size_t i = 0; i < vec1.size(); i++) { - INFO(i); - CHECK(vec1[i] == vec2[i]); - CHECK(positions1[i] == positions2[i]); - } -} - -void append_seq_small_cases_write_over() { - std::string str1 = "CCGTGT"; - std::string str2 = "CCGNGT"; - std::string str3 = "AGCCTT"; - std::string str4 = "ANCCTT"; - std::string str5 = "A"; - - digest::Digester *dig = - new digest::ModMin( - str1, 4, 17, 0, 0, digest::MinimizedHashType::CANON); - append_seq_compare_write_over(str1, str3, *dig, 4); - delete dig; - - dig = new digest::ModMin( - str2, 4, 17, 0, 0, digest::MinimizedHashType::CANON); - append_seq_compare_write_over(str2, str4, *dig, 4); - delete dig; - - dig = new digest::ModMin( - str2, 4, 17, 0, 0, digest::MinimizedHashType::CANON); - 
append_seq_compare_write_over(str2, str3, *dig, 4); - delete dig; - - dig = new digest::ModMin( - str2, 4, 17, 0, 0, digest::MinimizedHashType::CANON); - append_seq_compare_write_over(str2, str5, *dig, 4); - delete dig; - - dig = new digest::ModMin( - str1, 4, 17, 0, 0, digest::MinimizedHashType::CANON); - append_seq_compare_write_over(str1, str5, *dig, 4); - delete dig; -} - -void append_seq_small_cases2_write_over() { - std::string str1_good = "CATACCGGT"; - std::string str1_short = "TAG"; - std::string str1_badCh = "CATACNCGGT"; - - std::string str2_good = "GTTCTCGCTT"; - std::string str2_badCh = "GTNTCTCGCTT"; - std::string str2A = "A"; - std::string str2_short = "TGGA"; - - std::string str3_good = "CAACGACCGC"; - std::string str3_badCh = "NCAACGACCGC"; - - digest::Digester *dig = - new digest::ModMin( - str1_good, 6, 17, 0, 0, digest::MinimizedHashType::CANON); - append_seq_compare3_write_over(str1_good, str2_good, str3_good, *dig, 6); - delete dig; - - dig = new digest::ModMin( - str1_good, 6, 17, 0, 0, digest::MinimizedHashType::CANON); - append_seq_compare3_write_over(str1_good, str2_badCh, str3_good, *dig, 6); - delete dig; - - dig = new digest::ModMin( - str1_good, 6, 17, 0, 0, digest::MinimizedHashType::CANON); - append_seq_compare3_write_over(str1_good, str2A, str3_good, *dig, 6); - delete dig; - - dig = new digest::ModMin( - str1_short, 6, 17, 0, 0, digest::MinimizedHashType::CANON); - append_seq_compare3_write_over(str1_short, str2A, str3_good, *dig, 6); - delete dig; - - dig = new digest::ModMin( - str1_badCh, 6, 17, 0, 0, digest::MinimizedHashType::CANON); - append_seq_compare3_write_over(str1_badCh, str2A, str3_good, *dig, 6); - delete dig; - - dig = new digest::ModMin( - str1_good, 6, 17, 0, 0, digest::MinimizedHashType::CANON); - append_seq_compare3_write_over(str1_good, str2_short, str3_good, *dig, 6); - delete dig; - - dig = new digest::ModMin( - str1_short, 6, 17, 0, 0, digest::MinimizedHashType::CANON); - 
append_seq_compare3_write_over(str1_short, str2A, str3_badCh, *dig, 6); - delete dig; -} -/* - consider re-organizing this so this only tests the UM_Digester specific - stuff like the constructor and roll_minimizer, but put the more general - stuff, append_seq and roll_one in the general testing group -*/ - -TEST_CASE("Digester Testing") { - setupStrings(); - // These use the ModMinimizer Class because Digester can't be instantiated, - // but correctness doesn't depend on any of the fields of functions in the - // ModMinimizer Class - SECTION("Base Constructor Special Cases") { - unsigned k; - digest::MinimizedHashType minimized_h; - uint32_t mod, congruence; - size_t pos; - std::string str; - // string is length 1, k = 1 - str = "AAAA"; - k = 4; - pos = 0; - for (int i = 0; i < 3; i++) { - minimized_h = static_cast(i); - mod = 2; - congruence = 1; - - digest::ModMin *dig = - new digest::ModMin( - str, k, mod, congruence, pos, minimized_h); - ModMin_constructor(*dig, str, k, pos, minimized_h, mod, congruence); - delete dig; - } - - // string is length 1, k = 4 - str = "A"; - k = ks[1]; - pos = 0; - for (int i = 0; i < 3; i++) { - minimized_h = static_cast(i); - mod = 2; - congruence = 1; - - digest::ModMin *dig = - new digest::ModMin( - str, k, mod, congruence, pos, minimized_h); - ModMin_constructor(*dig, str, k, pos, minimized_h, mod, congruence); - delete dig; - } - - for (uint i = 0; i < test_strs.size(); i++) { - for (int j = 0; j < 8; j++) { - k = ks[j]; - for (int l = 0; l < 16; l++) { - pos = l; - for (int p = 0; p < 3; p++) { - minimized_h = static_cast(p); - mod = 1e9 + 7; - congruence = 0; - - digest::ModMin *dig = - new digest::ModMin( - test_strs[i], k, mod, congruence, pos, minimized_h); - ModMin_constructor(*dig, test_strs[i], k, pos, minimized_h, mod, - congruence); - delete dig; - } - } - } - } - - // test writeover policy - for (int j = 0; j < 8; j++) { - k = ks[j]; - for (int l = 0; l < 16; l++) { - pos = l; - for (int p = 0; p < 3; p++) { - 
minimized_h = static_cast(p); - mod = 1e9 + 7; - congruence = 0; - - digest::ModMin *dig = - new digest::ModMin( - test_strs[4], k, mod, congruence, pos, minimized_h); - ModMin_constructor_writeover(*dig, test_strs[4], k, pos, minimized_h, - mod, congruence); - delete dig; - } - } - } - - // Throwing Exceptions - // Shouldn't/Doesn't leak any memory - // https://stackoverflow.com/questions/147572/will-the-below-code-cause-memory-leak-in-c - - str = "ACTGACTG"; - k = 4; - pos = 0; - minimized_h = digest::MinimizedHashType::CANON; - mod = 1e9 + 7; - congruence = 0; - - k = 0; - digest::ModMin *dig; - CHECK_THROWS_AS(dig = new digest::ModMin( - str, k, mod, congruence, pos, minimized_h), - digest::BadConstructionException); - k = 4; - - // pos >= seq.size() - pos = 8; - CHECK_THROWS_AS(dig = new digest::ModMin( - str, k, mod, congruence, pos, minimized_h), - digest::BadConstructionException); - pos = 0; - - // minimized_h > 2 - minimized_h = (digest::MinimizedHashType)3; - CHECK_THROWS_AS(dig = new digest::ModMin( - str, k, mod, congruence, pos, minimized_h), - digest::BadConstructionException); - minimized_h = (digest::MinimizedHashType)0; - } - - SECTION("Testing roll_one") { - for (int i = 0; i < 7; i++) { - for (int j = 0; j < 8; j++) { - digest::ModMin *dig = - new digest::ModMin( - test_strs[i], ks[j], 1e9 + 7, 0, 0, - digest::MinimizedHashType::FORWARD); - roll_one(*dig, test_strs[i], ks[j]); - delete dig; - } - } - - // testing roll_one for writeover - for (int j = 0; j < 8; j++) { - digest::ModMin *dig = - new digest::ModMin( - test_strs[4], ks[j], 1e9 + 7, 0, 0, - digest::MinimizedHashType::FORWARD); - roll_one_write_over(*dig, test_strs[4], ks[j]); - delete dig; - } - } - - SECTION("Testing append_seq()") { - append_seq_small_cases(); - // Throws NotRolledTillEndException() - digest::ModMin *dig = - new digest::ModMin(test_strs[0], 4, - 17); - CHECK_THROWS_AS(dig->append_seq(test_strs[0]), - digest::NotRolledTillEndException); - delete dig; - - for (int i = 
0; i < 7; i += 2) { - for (int j = 0; j < 8; j++) { - for (int l = 15; l < 91; l += 15) { - std::string str1 = test_strs[i].substr(0, l); - std::string str2 = test_strs[i].substr(l, 100); - digest::ModMin *dig = - new digest::ModMin( - str1, ks[j], 1e9 + 7, 0, 0, - digest::MinimizedHashType::FORWARD); - append_seq_compare(str1, str2, *dig, ks[j]); - delete dig; - } - } - } - append_seq_small_cases2(); - for (int i = 0; i < 7; i += 2) { - for (int j = 0; j < 8; j++) { - for (int l = 15; l < 91; l += 15) { - for (int r = 12; r < 85; r += 24) { - std::string str1 = test_strs[i].substr(0, l); - std::string str2 = test_strs[i].substr(l, r); - std::string str3 = test_strs[i].substr(l + r, 75); - digest::ModMin *dig = - new digest::ModMin( - str1, ks[j], 1e9 + 7, 0, 0, - digest::MinimizedHashType::FORWARD); - append_seq_compare3(str1, str2, str3, *dig, ks[j]); - delete dig; - } - } - } - } - - // testing append_seq for writeover - append_seq_small_cases_write_over(); - for (int j = 0; j < 8; j++) { - for (int l = 15; l < 91; l += 15) { - std::string str1 = test_strs[4].substr(0, l); - std::string str2 = test_strs[4].substr(l, 100); - digest::ModMin *dig = - new digest::ModMin( - str1, ks[j], 1e9 + 7, 0, 0, digest::MinimizedHashType::FORWARD); - append_seq_compare_write_over(str1, str2, *dig, ks[j]); - delete dig; - } - } - - append_seq_small_cases2_write_over(); - for (int j = 0; j < 8; j++) { - for (int l = 15; l < 91; l += 15) { - for (int r = 12; r < 85; r += 24) { - std::string str1 = test_strs[4].substr(0, l); - std::string str2 = test_strs[4].substr(l, r); - std::string str3 = test_strs[4].substr(l + r, 75); - digest::ModMin *dig = - new digest::ModMin( - str1, ks[j], 1e9 + 7, 0, 0, - digest::MinimizedHashType::FORWARD); - append_seq_compare3_write_over(str1, str2, str3, *dig, ks[j]); - delete dig; - } - } - } - } - - SECTION("Testing new_seq()") { - unsigned k; - std::string str; - // string is length 1, k = 4 - str = "A"; - k = ks[1]; - - digest::ModMin *dig1 = - 
new digest::ModMin( - test_strs[0], k, 1e9 + 7, 0, 0, digest::MinimizedHashType::CANON); - dig1->new_seq(str, 0); - base_constructor(*dig1, str, k, 0, digest::MinimizedHashType::CANON); - delete dig1; - - // Throw BadConstructionException() - dig1 = new digest::ModMin( - test_strs[0], k, 1e9 + 7, 0, 0, digest::MinimizedHashType::CANON); - CHECK_THROWS_AS(dig1->new_seq(test_strs[0], 500), - digest::BadConstructionException); - delete dig1; - - for (uint i = 0; i < test_strs.size(); i += 2) { - for (int j = 0; j < 32; j += 8) { - digest::ModMin *dig = - new digest::ModMin( - test_strs[1], ks[3], 1e9 + 7, 0, 0, - digest::MinimizedHashType::CANON); - dig->new_seq(test_strs[i], j); - base_constructor(*dig, test_strs[i], ks[3], j, - digest::MinimizedHashType::CANON); - delete dig; - } - } - - for (uint i = 0; i < test_strs.size(); i += 2) { - for (int l = 13; l <= 78; l += 13) { - digest::ModMin *dig = - new digest::ModMin( - test_strs[5], ks[3], 1e9 + 7, 0, 0, - digest::MinimizedHashType::CANON); - int ind = 0; - while (ind < l && dig->roll_one()) { - ind++; - } - dig->new_seq(test_strs[i], 0); - base_constructor(*dig, test_strs[i], ks[3], 0, - digest::MinimizedHashType::CANON); - delete dig; - } - } - - // new_seq when deque has stuff in it - dig1 = new digest::ModMin( - test_strs[2], 8, 17, 0, 0, digest::MinimizedHashType::CANON); - std::vector vec; - dig1->roll_minimizer(1000, vec); - vec.clear(); - dig1->append_seq(test_strs[2]); - dig1->roll_minimizer(1000, vec); - vec.clear(); - dig1->new_seq(test_strs[4], 0); - base_constructor(*dig1, test_strs[4], 8, 0, - digest::MinimizedHashType::CANON); - delete dig1; - - // new_seq when deque has stuff in it and a new hash can't be properly - // initialized - std::string bad_str = "TTACTNGTACCTG"; - dig1 = new digest::ModMin( - test_strs[2], 8, 17, 0, 0, digest::MinimizedHashType::CANON); - dig1->roll_minimizer(1000, vec); - vec.clear(); - dig1->append_seq(test_strs[2]); - dig1->roll_minimizer(1000, vec); - vec.clear(); - 
dig1->new_seq(bad_str, 0); - base_constructor(*dig1, bad_str, 8, 0, digest::MinimizedHashType::CANON); - delete dig1; - } -} - -TEST_CASE("ModMin Testing") { - setupStrings(); - - SECTION("Testing Constructors") { - unsigned k; - digest::MinimizedHashType minimized_h; - uint32_t mod, congruence; - size_t pos; - std::string str; - - // Throwing Exceptions - // Shouldn't/Doesn't leak any memory - // https://stackoverflow.com/questions/147572/will-the-below-code-cause-memory-leak-in-c - - str = "ACTGACTG"; - k = 4; - pos = 0; - minimized_h = digest::MinimizedHashType::CANON; - digest::ModMin *dig; - - // mod >= congruence - mod = 2; - congruence = 2; - CHECK_THROWS_AS(dig = new digest::ModMin( - str, k, mod, congruence, pos, minimized_h), - digest::BadModException); - } - - // maybe move this into an entirely new test case, and make this big thing - // just tests for the Dig class - SECTION("Testing roll_minimizer(). The one that takes no parameters") { - uint32_t prime = 17; - for (int i = 0; i < 7; i += 2) { - for (int j = 0; j < 8; j++) { - for (int l = 0; l < 3; l++) { - digest::ModMin *dig = - new digest::ModMin( - test_strs[i], ks[j], prime, 0, 0, - static_cast(l)); - ModMin_roll_minimizer(*dig, test_strs[i], ks[j], - static_cast(l), - prime); - delete dig; - } - } - } - } - - SECTION("Testing Copy Constructor") { - for (int i = 0; i < 7; i += 2) { - for (int j = 0; j < 8; j++) { - for (int l = 15; l < 91; l += 15) { - digest::ModMin *dig1 = - new digest::ModMin( - test_strs[i], ks[j], 1e9 + 7, 0, l, - digest::MinimizedHashType::FORWARD); - digest::ModMin *dig2 = - new digest::ModMin(*dig1); - ModMin_dig_comp(*dig1, *dig2); - delete dig1; - delete dig2; - } - } - } - - for (int i = 0; i < 7; i += 2) { - for (int j = 0; j < 8; j++) { - for (int l = 15; l < 91; l += 15) { - std::string str1 = test_strs[i].substr(0, l); - std::string str2 = test_strs[i].substr(l, 100); - digest::ModMin *dig1 = - new digest::ModMin( - str1, ks[j], 1e9 + 7, 0, 0, - 
digest::MinimizedHashType::FORWARD); - std::vector vec; - dig1->roll_minimizer(1000, vec); - dig1->append_seq(str2); - digest::ModMin *dig2 = - new digest::ModMin(*dig1); - ModMin_dig_comp(*dig1, *dig2); - delete dig1; - delete dig2; - } - } - } - } - - SECTION("Testing Assignment Operator") { - for (int i = 0; i < 7; i += 2) { - for (int j = 0; j < 8; j++) { - for (int l = 15; l < 91; l += 15) { - digest::ModMin *dig1 = - new digest::ModMin( - test_strs[i], ks[j], 1e9 + 7, 0, l, - digest::MinimizedHashType::FORWARD); - digest::ModMin *dig2 = - new digest::ModMin( - test_strs[1], 99, 98765, 3, 0, - digest::MinimizedHashType::REVERSE); - *dig2 = *dig1; - ModMin_dig_comp(*dig1, *dig2); - delete dig1; - delete dig2; - } - } - } - - for (int i = 0; i < 7; i += 2) { - for (int j = 0; j < 8; j++) { - for (int l = 15; l < 91; l += 15) { - std::string str1 = test_strs[i].substr(0, l); - std::string str2 = test_strs[i].substr(l, 100); - digest::ModMin *dig1 = - new digest::ModMin( - str1, ks[j], 1e9 + 7, 0, 0, - digest::MinimizedHashType::FORWARD); - std::vector vec; - dig1->roll_minimizer(1000, vec); - dig1->append_seq(str2); - digest::ModMin *dig2 = - new digest::ModMin( - test_strs[1], 99, 98765, 3, 0, - digest::MinimizedHashType::REVERSE); - *dig2 = *dig1; - ModMin_dig_comp(*dig1, *dig2); - delete dig1; - delete dig2; - } - } - } - } -} - -#define do64(F) \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::SegmentTree<4>, 4) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::SegmentTree<31>, 31) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::SegmentTree<32>, 32) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::SegmentTree<33>, 33) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::SegmentTree<63>, 63) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::SegmentTree<64>, 64) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive<4>, 4) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive<31>, 31) } \ - { 
F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive<32>, 32) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive<33>, 33) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive<63>, 63) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive<64>, 64) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive2<4>, 4) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive2<31>, 31) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive2<32>, 32) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive2<33>, 33) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive2<63>, 63) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive2<64>, 64) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Adaptive, 4) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Adaptive, 31) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Adaptive, 32) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Adaptive, 33) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Adaptive, 63) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Adaptive, 64) } - -TEST_CASE("WindowMin Testing") { - SECTION("Constructor Testing") { - unsigned k; - digest::MinimizedHashType minimized_h; - size_t pos; - std::string str; - - // Throwing Exceptions - // Shouldn't/Doesn't leak any memory - // https://stackoverflow.com/questions/147572/will-the-below-code-cause-memory-leak-in-c - - str = "ACTGACTG"; - k = 4; - pos = 0; - minimized_h = digest::MinimizedHashType::CANON; - /* - #define WC(T) \ - digest::WindowMin* dig1; \ - CHECK_THROWS_AS((dig1 = new digest::WindowMin(str, k, 0, pos, - minimized_h)), digest::BadWindowSizeException); - - WC(data_structure::SegmentTree<0>); - */ - - for (uint i = 0; i < test_strs.size(); i++) { - k = 4; - pos = 0; - minimized_h = digest::MinimizedHashType::CANON; - -#define TEST_CONSTRUCTOR_0(P, T, j) \ - digest::WindowMin *dig = \ - new digest::WindowMin(test_strs[i], k, j, pos, minimized_h); \ - 
WindowMin_constructor(*dig, test_strs[i], k, j, pos, minimized_h); \ - delete dig; - - do64(TEST_CONSTRUCTOR_0) - } - } - - SECTION("roll_minimizer() testing") { - for (int i = 0; i < 7; i += 2) { - // std::cout << test_strs[i] << std::endl; - for (int j = 0; j < 8; j++) { - for (int l = 0; l < 3; l++) { -#define TEST_ROLL_0(p, m, k) \ - digest::WindowMin *dig = new digest::WindowMin( \ - test_strs[i], ks[j], k, 0, static_cast(l)); \ - WindowMin_roll_minimizer(*dig, test_strs[i], ks[j], k, \ - static_cast(l)); \ - delete dig; - - do64(TEST_ROLL_0) - } - } - } - } - /* - the below also inadverntently tests how append_seq (only the case that - there are 2 sequences involved total) works with roll_minimizer for - WindowMin. In theory this shouldn't be needed and also can't be considered - "thorough", but it is extra assurance. - */ - SECTION("Testing Copy Constructor") { - for (int i = 0; i < 7; i += 2) { - for (int j = 0; j < 8; j++) { - for (int l = 15; l < 91; l += 15) { -#define TEST_COPY_0(P, T, k) \ - digest::WindowMin dig1(test_strs[i], ks[j], k, l, \ - digest::MinimizedHashType::FORWARD); \ - digest::WindowMin dig2(dig1); \ - WindowMin_dig_comp(dig1, dig2); - - do64(TEST_COPY_0) - } - } - } - - for (int i = 0; i < 7; i += 2) { - for (int j = 0; j < 8; j++) { - for (int l = 15; l < 91; l += 15) { -#define TEST_COPY_1(P, T, k) \ - std::string str1 = test_strs[i].substr(0, l); \ - std::string str2 = test_strs[i].substr(l, 100); \ - digest::WindowMin dig1(str1, ks[j], k, 0, \ - digest::MinimizedHashType::FORWARD); \ - std::vector vec; \ - dig1.roll_minimizer(1000, vec); \ - dig1.append_seq(str2); \ - digest::WindowMin dig2(dig1); \ - WindowMin_dig_comp(dig1, dig2); - - do64(TEST_COPY_1) - } - } - } - } - - SECTION("Testing Assignment Operator") { - for (int i = 0; i < 7; i += 2) { - for (int j = 0; j < 8; j++) { - for (int l = 15; l < 91; l += 15) { -#define TEST_COPY_2(P, T, k) \ - digest::WindowMin dig1(test_strs[i], ks[j], k, l, \ - 
digest::MinimizedHashType::FORWARD); \ - digest::WindowMin dig2(test_strs[1], 99, k, 0, \ - digest::MinimizedHashType::REVERSE); \ - dig2 = dig1; \ - WindowMin_dig_comp(dig1, dig2); - - do64(TEST_COPY_2) - } - } - } - - for (int i = 0; i < 7; i += 2) { - for (int j = 0; j < 8; j++) { - for (int l = 15; l < 91; l += 15) { -#define TEST_COPY_3(P, T, m) \ - std::string str1 = test_strs[i].substr(0, l); \ - std::string str2 = test_strs[i].substr(l, 100); \ - digest::WindowMin dig1(str1, ks[j], m, 0, \ - digest::MinimizedHashType::FORWARD); \ - std::vector vec; \ - dig1.roll_minimizer(1000, vec); \ - dig1.append_seq(str2); \ - digest::WindowMin dig2(test_strs[1], 35, m, 0, \ - digest::MinimizedHashType::REVERSE); \ - dig2 = dig1; \ - WindowMin_dig_comp(dig1, dig2); - - do64(TEST_COPY_3) - } - } - } - } -} - -TEST_CASE("Syncmer Testing") { - // Syncmer and WindowMinimizers have all the same class members so I can just - // use the WindowMin tests for Constructor and be ok - SECTION("Constructor Testing") { - - unsigned k; - digest::MinimizedHashType minimized_h; - size_t pos; - std::string str; - - // Throwing Exceptions - // Shouldn't/Doesn't leak any memory - // https://stackoverflow.com/questions/147572/will-the-below-code-cause-memory-leak-in-c - - str = "ACTGACTG"; - k = 4; - pos = 0; - minimized_h = digest::MinimizedHashType::CANON; - // digest::Syncmer>* dig1; - // CHECK_THROWS_AS((dig1 = new - // digest::Syncmer>(str, k, - // 0, pos, minimized_h)), digest::BadWindowSizeException); - - for (uint i = 0; i < test_strs.size(); i++) { -#define TEST_SYNCON(P, T, j) \ - k = 4; \ - pos = 0; \ - minimized_h = digest::MinimizedHashType::CANON; \ - \ - digest::Syncmer dig(test_strs[i], k, j, pos, minimized_h); \ - WindowMin_constructor(dig, test_strs[i], k, j, pos, minimized_h); - - do64(TEST_SYNCON) - } - } - - SECTION("roll_minimizer() testing") { - for (int i = 0; i < 7; i += 2) { - // std::cout << test_strs[i] << std::endl; - for (int j = 0; j < 8; j++) { - for (int l = 
0; l < 3; l++) { -#define TEST_SYNCROLL(P, T, m) \ - digest::Syncmer dig(test_strs[i], ks[j], m, 0, \ - static_cast(l)); \ - Syncmer_roll_minimizer(dig, test_strs[i], ks[j], m, \ - static_cast(l)); - - do64(TEST_SYNCROLL) - } - } - } - } - - /* - the below also inadverntently tests how append_seq (only the case that - there are 2 sequences involved total) works with roll_minimizer for - WindowMin. In theory this shouldn't be needed and also can't be considered - "thorough", but it is extra assurance. - */ - SECTION("Testing Copy Constructor") { - for (int i = 0; i < 7; i += 2) { - for (int j = 0; j < 8; j++) { - for (int l = 15; l < 91; l += 15) { -#define TEST_SYNCOPY_0(P, T, m) \ - digest::Syncmer dig1(test_strs[i], ks[j], m, l, \ - digest::MinimizedHashType::FORWARD); \ - digest::Syncmer dig2(dig1); \ - Syncmer_dig_comp(dig1, dig2); - - do64(TEST_SYNCOPY_0) - } - } - } - - for (int i = 0; i < 7; i += 2) { - for (int j = 0; j < 8; j++) { - for (int l = 15; l < 91; l += 15) { -#define TEST_SYNCOPY_1(P, T, m) \ - std::string str1 = test_strs[i].substr(0, l); \ - std::string str2 = test_strs[i].substr(l, 100); \ - digest::Syncmer dig1(str1, ks[j], m, 0, \ - digest::MinimizedHashType::FORWARD); \ - std::vector vec; \ - dig1.roll_minimizer(1000, vec); \ - dig1.append_seq(str2); \ - digest::Syncmer dig2(dig1); \ - Syncmer_dig_comp(dig1, dig2); - - do64(TEST_SYNCOPY_1) - } - } - } - } - - // had to modify !!! 
- SECTION("Testing Assignment Operator") { - for (int i = 0; i < 7; i += 2) { - for (int j = 0; j < 8; j++) { - for (int l = 15; l < 91; l += 15) { -#define TEST_ASSIGNMENT_0(P, T, m) \ - digest::Syncmer *dig1 = new digest::Syncmer( \ - test_strs[i], ks[j], m, l, digest::MinimizedHashType::FORWARD); \ - digest::Syncmer *dig2 = new digest::Syncmer( \ - test_strs[1], 99, m, 0, digest::MinimizedHashType::REVERSE); \ - *dig2 = *dig1; \ - Syncmer_dig_comp(*dig1, *dig2); - - do64(TEST_ASSIGNMENT_0) - } - } - } - - for (int i = 0; i < 7; i += 2) { - for (int j = 0; j < 8; j++) { - for (int l = 15; l < 91; l += 15) { -#define TEST_ASSIGNMENT_1(P, T, m) \ - std::string str1 = test_strs[i].substr(0, l); \ - std::string str2 = test_strs[i].substr(l, 100); \ - digest::Syncmer *dig1 = new digest::Syncmer( \ - str1, ks[j], m, 0, digest::MinimizedHashType::FORWARD); \ - std::vector vec; \ - dig1->roll_minimizer(1000, vec); \ - dig1->append_seq(str2); \ - digest::Syncmer *dig2 = new digest::Syncmer( \ - test_strs[1], 35, m, 0, digest::MinimizedHashType::REVERSE); \ - *dig2 = *dig1; \ - Syncmer_dig_comp(*dig1, *dig2); - - do64(TEST_ASSIGNMENT_1) - } - } - } - } -} - -// #include -// -// template -// class MyClass { -// public: -// void display() { -// std::cout << "Value of k: " << k << std::endl; -// } -// }; -// -// // Template specialization to handle the base case -// template <> -// class MyClass<0> { -// public: -// void display() { -// // Do nothing or handle the base case as needed -// } -// }; -// -// // Recursive template to instantiate objects with values from 1 to 32 -// template -// struct InstantiateObjects { -// static void instantiate() { -// MyClass obj; -// obj.display(); -// InstantiateObjects::instantiate(); -// } -// }; -// -// // Template specialization to handle the base case of recursion -// template <> -// struct InstantiateObjects<0> { -// static void instantiate() { -// // Do nothing or handle the base case as needed -// } -// }; -// -// int main() { -// 
// Instantiate objects with values from 1 to 32 -// InstantiateObjects<32>::instantiate(); -// -// return 0; -// } -// +#include "digest/data_structure.hpp" +#include "digest/mod_minimizer.hpp" +#include "digest/syncmer.hpp" +#include "digest/window_minimizer.hpp" +#include +#include +#include +#include +#include +#include + +std::vector test_strs; +// used to be first value was 1, but now k must be >= 4 +unsigned ks[] = {4, 4, 7, 8, 9, 16, 25, 64}; + +void setupStrings() { + std::string files[] = { + "../tests/test/A.txt", + "../tests/test/a_lowercase.txt", + "../tests/test/salmonella_enterica.txt", + "../tests/test/salmonella_lowercase.txt", + "../tests/test/random.txt", + "../tests/test/random_lowercase.txt", + "../tests/test/N.txt", + "../tests/test/random_N_to_A.txt", + }; + + for (auto &file : files) { + std::ifstream ifs(file); + ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); + std::string str; + ifs >> str; + test_strs.push_back(str); + } +} + +template +void base_constructor(digest::Digester

&dig, std::string &str, unsigned k, + size_t pos, digest::MinimizedHashType minimized_h) { + INFO("String is: " << str); + INFO("K is: " << k); + INFO("Pos is: " << dig.get_pos()); + + CHECK(strcmp(str.c_str(), dig.get_sequence()) == 0); + CHECK(str.size() == dig.get_len()); + CHECK(dig.get_k() == k); + CHECK(dig.get_minimized_h() == minimized_h); + if (k <= str.size()) { + nthash::NtHash tHash(str, 1, k, pos); + CHECK(dig.get_is_valid_hash() == tHash.roll()); + if (dig.get_is_valid_hash()) { + CHECK(dig.get_pos() == tHash.get_pos()); + INFO("ntHash pos is: " << tHash.get_pos()); + CHECK(dig.get_fhash() == tHash.get_forward_hash()); + CHECK(dig.get_rhash() == tHash.get_reverse_hash()); + } + } else { + CHECK(dig.get_is_valid_hash() == false); + } +} + +template +void base_constructor_writeover(digest::Digester

&dig, std::string &str, + unsigned k, size_t pos, + digest::MinimizedHashType minimized_h) { + INFO("String is: " << str); + INFO("K is: " << k); + INFO("Pos is: " << dig.get_pos()); + + CHECK(strcmp(str.c_str(), dig.get_sequence()) == 0); + CHECK(str.size() == dig.get_len()); + CHECK(dig.get_k() == k); + CHECK(dig.get_minimized_h() == minimized_h); + if (k <= str.size()) { + nthash::NtHash tHash(test_strs[7], 1, k, pos); + CHECK(dig.get_is_valid_hash() == tHash.roll()); + if (dig.get_is_valid_hash()) { + CHECK(dig.get_pos() == tHash.get_pos()); + INFO("ntHash pos is: " << tHash.get_pos()); + CHECK(dig.get_fhash() == tHash.get_forward_hash()); + CHECK(dig.get_rhash() == tHash.get_reverse_hash()); + } + } else { + CHECK(dig.get_is_valid_hash() == false); + } +} + +template +void base_dig_comp(digest::Digester

&dig1, digest::Digester

&dig2) { + CHECK(strcmp(dig1.get_sequence(), dig2.get_sequence()) == 0); + CHECK(dig1.get_len() == dig2.get_len()); + CHECK(dig1.get_k() == dig2.get_k()); + CHECK(dig1.get_minimized_h() == dig2.get_minimized_h()); + CHECK(dig1.get_is_valid_hash() == dig2.get_is_valid_hash()); + if (dig1.get_is_valid_hash()) { + CHECK(dig1.get_chash() == dig2.get_chash()); + CHECK(dig1.get_fhash() == dig2.get_fhash()); + CHECK(dig1.get_rhash() == dig2.get_rhash()); + } +} + +template +void base_dig_roll(digest::Digester

&dig1, digest::Digester

&dig2) { + while (dig1.get_is_valid_hash()) { + dig1.roll_one(); + dig2.roll_one(); + CHECK(dig1.get_fhash() == dig2.get_fhash()); + CHECK(dig1.get_rhash() == dig2.get_rhash()); + CHECK(dig1.get_pos() == dig2.get_pos()); + } + CHECK(dig1.get_is_valid_hash() == dig2.get_is_valid_hash()); +} + +template +void ModMin_constructor(digest::ModMin

&dig, std::string &str, unsigned k, + size_t pos, digest::MinimizedHashType minimized_h, + uint64_t mod, uint64_t congruence) { + base_constructor(dig, str, k, pos, minimized_h); + CHECK(dig.get_mod() == mod); + CHECK(dig.get_congruence() == congruence); +} + +template +void ModMin_constructor_writeover(digest::ModMin

&dig, std::string &str, + unsigned k, size_t pos, + digest::MinimizedHashType minimized_h, + uint64_t mod, uint64_t congruence) { + base_constructor_writeover(dig, str, k, pos, minimized_h); + CHECK(dig.get_mod() == mod); + CHECK(dig.get_congruence() == congruence); +} + +template +void WindowMin_constructor(digest::WindowMin &dig, std::string &str, + unsigned k, unsigned large_wind_kmer_am, size_t pos, + digest::MinimizedHashType minimized_h) { + base_constructor(dig, str, k, pos, minimized_h); + CHECK(dig.get_large_wind_kmer_am() == large_wind_kmer_am); + // CHECK(dig.get_st_index() == 0); + CHECK(dig.get_ds_size() == 0); + CHECK(dig.get_is_minimized() == false); +} + +template +void ModMin_dig_comp(digest::ModMin

&dig1, digest::ModMin

&dig2) { + base_dig_comp(dig1, dig2); + CHECK(dig1.get_mod() == dig2.get_mod()); + CHECK(dig1.get_congruence() == dig2.get_congruence()); + base_dig_roll(dig1, dig2); +} + +template +void WindowMin_roll_minimizers_comp(digest::WindowMin &dig1, + digest::WindowMin &dig2) { + std::vector vec1; + std::vector vec2; + dig1.roll_minimizer(1000, vec1); + dig2.roll_minimizer(1000, vec2); + REQUIRE(vec1.size() == vec2.size()); + for (size_t i = 0; i < vec1.size(); i++) { + CHECK(vec1[i] == vec2[i]); + } +} + +template +void Syncmer_roll_minimizers_comp(digest::Syncmer &dig1, + digest::Syncmer &dig2) { + std::vector vec1; + std::vector vec2; + dig1.roll_minimizer(1000, vec1); + dig2.roll_minimizer(1000, vec2); + REQUIRE(vec1.size() == vec2.size()); + for (size_t i = 0; i < vec1.size(); i++) { + CHECK(vec1[i] == vec2[i]); + } +} + +template +void WindowMin_dig_comp(digest::WindowMin &dig1, + digest::WindowMin &dig2) { + base_dig_comp(dig1, dig2); + CHECK(dig1.get_large_wind_kmer_am() == dig2.get_large_wind_kmer_am()); + CHECK(dig1.get_ds_size() == dig2.get_ds_size()); + CHECK(dig1.get_is_minimized() == dig2.get_is_minimized()); + // need to use this because I need to check, or at least get some + // indication, of whether the two seg trees are the same + WindowMin_roll_minimizers_comp(dig1, dig2); +} + +template +void Syncmer_dig_comp(digest::Syncmer &dig1, + digest::Syncmer &dig2) { + base_dig_comp(dig1, dig2); + CHECK(dig1.get_large_wind_kmer_am() == dig2.get_large_wind_kmer_am()); + CHECK(dig1.get_ds_size() == dig2.get_ds_size()); + CHECK(dig1.get_is_minimized() == dig2.get_is_minimized()); + // need to use this because I need to check, or at least get some + // indication, of whether the two seg trees are the same + Syncmer_roll_minimizers_comp(dig1, dig2); +} + +template +void roll_one(digest::Digester

&dig, std::string &str, unsigned k) { + INFO(str); + INFO(k); + nthash::NtHash tHash(str, 1, k, 0); + uint64_t true_fhash; + uint64_t true_rhash; + uint64_t dig_fhash; + uint64_t dig_rhash; + bool worked = tHash.roll(); + while ((worked = tHash.roll())) { + dig.roll_one(); + CHECK(dig.get_is_valid_hash() == worked); + if (worked) { + CHECK(dig.get_pos() == tHash.get_pos()); + true_fhash = tHash.get_forward_hash(); + true_rhash = tHash.get_reverse_hash(); + dig_fhash = dig.get_fhash(); + dig_rhash = dig.get_rhash(); + CHECK(dig_fhash == true_fhash); + CHECK(dig_rhash == true_rhash); + } + } + dig.roll_one(); + CHECK(dig.get_is_valid_hash() == worked); +} + +template +void roll_one_write_over(digest::Digester

&dig, std::string &str, + unsigned k) { + INFO(str); + INFO(k); + nthash::NtHash tHash(test_strs[7], 1, k, 0); + uint64_t true_fhash; + uint64_t true_rhash; + uint64_t dig_fhash; + uint64_t dig_rhash; + bool worked = tHash.roll(); + while ((worked = tHash.roll())) { + dig.roll_one(); + CHECK(dig.get_is_valid_hash() == worked); + if (worked) { + CHECK(dig.get_pos() == tHash.get_pos()); + true_fhash = tHash.get_forward_hash(); + true_rhash = tHash.get_reverse_hash(); + dig_fhash = dig.get_fhash(); + dig_rhash = dig.get_rhash(); + CHECK(dig_fhash == true_fhash); + CHECK(dig_rhash == true_rhash); + } + } + dig.roll_one(); + CHECK(dig.get_is_valid_hash() == worked); +} + +template +void ModMin_roll_minimizer(digest::ModMin

&dig, std::string &str, unsigned k, + digest::MinimizedHashType minimized_h, + uint32_t prime) { + nthash::NtHash tHash(str, 1, k, 0); + std::vector positions; + std::vector hashes; + while (tHash.roll()) { + uint32_t temp; + if (minimized_h == digest::MinimizedHashType::CANON) { + temp = *(tHash.hashes()); + } else if (minimized_h == digest::MinimizedHashType::FORWARD) { + temp = tHash.get_forward_hash(); + } else { + temp = tHash.get_reverse_hash(); + } + if (temp % prime == 0) { + positions.push_back(tHash.get_pos()); + hashes.push_back(temp); + } + } + digest::ModMin dig2 = dig; + + std::vector dig_positions; + dig.roll_minimizer(400, dig_positions); + REQUIRE(positions.size() == dig_positions.size()); + for (size_t i = 0; i < positions.size(); i++) { + CHECK(dig_positions[i] == positions[i]); + } + + std::vector> dig_positions2; + dig2.roll_minimizer(400, dig_positions2); + REQUIRE(positions.size() == dig_positions2.size()); + for (size_t i = 0; i < positions.size(); i++) { + CHECK(dig_positions2[i].first == positions[i]); + CHECK(dig_positions2[i].second == hashes[i]); + } +} + +template +void WindowMin_roll_minimizer(digest::WindowMin &dig, std::string &str, + unsigned k, unsigned large_wind_kmer_am, + digest::MinimizedHashType minimized_h) { + nthash::NtHash tHash(str, 1, k, 0); + std::vector> hashes; + while (tHash.roll()) { + uint32_t temp; + if (minimized_h == digest::MinimizedHashType::CANON) { + temp = *(tHash.hashes()); + } else if (minimized_h == digest::MinimizedHashType::FORWARD) { + temp = tHash.get_forward_hash(); + } else { + temp = tHash.get_reverse_hash(); + } + hashes.push_back(std::make_pair(temp, tHash.get_pos())); + } + + std::vector> answers; + std::pair prev; + for (size_t i = 0; i + large_wind_kmer_am <= hashes.size(); i++) { + std::pair temp_pair = hashes[i]; + for (uint j = 1; j < large_wind_kmer_am; j++) { + std::pair curr = hashes[i + j]; + if (curr.first < temp_pair.first) { + temp_pair = curr; + } else if (curr.first == 
temp_pair.first) { + if (curr.second > temp_pair.second) { + temp_pair = curr; + } + } + } + if (i == 0) { + prev = temp_pair; + answers.push_back(temp_pair); + } else { + if (prev != temp_pair) { + prev = temp_pair; + answers.push_back(temp_pair); + } + } + } + + digest::WindowMin dig2 = dig; + + std::vector wind_mins; + dig.roll_minimizer(1000, wind_mins); + REQUIRE(answers.size() == wind_mins.size()); + for (size_t i = 0; i < answers.size(); i++) { + CHECK(wind_mins[i] == answers[i].second); + } + + std::vector> wind_mins2; + dig2.roll_minimizer(1000, wind_mins2); + REQUIRE(answers.size() == wind_mins2.size()); + for (size_t i = 0; i < answers.size(); i++) { + CHECK(wind_mins2[i].second == answers[i].first); + CHECK(wind_mins2[i].first == answers[i].second); + } +} + +template +void Syncmer_roll_minimizer(digest::Syncmer &dig, std::string &str, + unsigned k, unsigned large_wind_kmer_am, + digest::MinimizedHashType minimized_h) { + nthash::NtHash tHash(str, 1, k, 0); + std::vector> hashes; + while (tHash.roll()) { + uint32_t temp; + if (minimized_h == digest::MinimizedHashType::CANON) { + temp = *(tHash.hashes()); + } else if (minimized_h == digest::MinimizedHashType::FORWARD) { + temp = tHash.get_forward_hash(); + } else { + temp = tHash.get_reverse_hash(); + } + hashes.push_back(std::make_pair(temp, tHash.get_pos())); + } + + std::vector> answers; + for (size_t i = 0; i + large_wind_kmer_am <= hashes.size(); i++) { + uint32_t minAm = hashes[i].first; + + for (uint j = 1; j < large_wind_kmer_am; j++) { + minAm = std::min(minAm, hashes[i + j].first); + } + + if (minAm == hashes[i].first || + minAm == hashes[i + large_wind_kmer_am - 1].first) { + answers.emplace_back(hashes[i].second, minAm); + } + } + + digest::Syncmer dig2 = dig; + + std::vector syncs; + dig.roll_minimizer(1000, syncs); + + assert(answers.size() == syncs.size()); + REQUIRE(answers.size() == syncs.size()); + for (size_t i = 0; i < answers.size(); i++) { + CHECK(syncs[i] == answers[i].first); + } 
+ + std::vector> syncs2; + dig2.roll_minimizer(1000, syncs2); + REQUIRE(answers.size() == syncs2.size()); + for (size_t i = 0; i < answers.size(); i++) { + CHECK(syncs2[i].first == answers[i].first); + CHECK(syncs2[i].second == answers[i].second); + } +} + +template +void append_seq_compare(std::string &str1, std::string &str2, + digest::Digester

&dig, unsigned k) { + INFO(str1); + INFO(str2); + INFO(str1.size()); + INFO(str2.size()); + INFO(k); + + std::string str3 = str1 + str2; + nthash::NtHash tHash(str3, 1, k); + std::vector vec1; + std::vector positions1; + while (tHash.roll()) { + vec1.push_back(*(tHash.hashes())); + positions1.push_back(tHash.get_pos()); + } + std::vector vec2; + std::vector positions2; + if (dig.get_is_valid_hash()) { + vec2.push_back(dig.get_chash()); + positions2.push_back(dig.get_pos()); + while (dig.roll_one()) { + vec2.push_back(dig.get_chash()); + positions2.push_back(dig.get_pos()); + } + } + dig.append_seq(str2); + if (dig.get_is_valid_hash()) { + vec2.push_back(dig.get_chash()); + positions2.push_back(dig.get_pos()); + while (dig.roll_one()) { + vec2.push_back(dig.get_chash()); + positions2.push_back(dig.get_pos()); + } + } + REQUIRE(vec1.size() == vec2.size()); + for (size_t i = 0; i < vec1.size(); i++) { + INFO(i); + CHECK(vec1[i] == vec2[i]); + CHECK(positions1[i] == positions2[i]); + } +} + +template +void append_seq_compare3(std::string &str1, std::string &str2, std::string str3, + digest::Digester

&dig, unsigned k) { + INFO(str1); + INFO(str2); + INFO(str3); + INFO(k); + // Make sure to check positions too + std::string str4 = str1 + str2 + str3; + nthash::NtHash tHash(str4, 1, k); + std::vector vec1; + std::vector positions1; + while (tHash.roll()) { + vec1.push_back(*(tHash.hashes())); + positions1.push_back(tHash.get_pos()); + } + std::vector vec2; + std::vector positions2; + if (dig.get_is_valid_hash()) { + vec2.push_back(dig.get_chash()); + positions2.push_back(dig.get_pos()); + while (dig.roll_one()) { + vec2.push_back(dig.get_chash()); + positions2.push_back(dig.get_pos()); + } + } + dig.append_seq(str2); + if (dig.get_is_valid_hash()) { + vec2.push_back(dig.get_chash()); + positions2.push_back(dig.get_pos()); + while (dig.roll_one()) { + vec2.push_back(dig.get_chash()); + positions2.push_back(dig.get_pos()); + } + } + dig.append_seq(str3); + if (dig.get_is_valid_hash()) { + vec2.push_back(dig.get_chash()); + positions2.push_back(dig.get_pos()); + while (dig.roll_one()) { + vec2.push_back(dig.get_chash()); + positions2.push_back(dig.get_pos()); + } + } + REQUIRE(vec1.size() == vec2.size()); + for (size_t i = 0; i < vec1.size(); i++) { + INFO(i); + CHECK(vec1[i] == vec2[i]); + CHECK(positions1[i] == positions2[i]); + } +} + +void append_seq_small_cases() { + std::string str1 = "CCGTGT"; + std::string str2 = "CCGNGT"; + std::string str3 = "AGCCTT"; + std::string str4 = "ANCCTT"; + std::string str5 = "A"; + + digest::Digester *dig = + new digest::ModMin( + str1, 4, 17, 0, 0, digest::MinimizedHashType::CANON); + append_seq_compare(str1, str3, *dig, 4); + delete dig; + + dig = new digest::ModMin( + str2, 4, 17, 0, 0, digest::MinimizedHashType::CANON); + append_seq_compare(str2, str4, *dig, 4); + delete dig; + + dig = new digest::ModMin( + str2, 4, 17, 0, 0, digest::MinimizedHashType::CANON); + append_seq_compare(str2, str3, *dig, 4); + delete dig; + + dig = new digest::ModMin( + str2, 4, 17, 0, 0, digest::MinimizedHashType::CANON); + 
append_seq_compare(str2, str5, *dig, 4); + delete dig; + + dig = new digest::ModMin( + str1, 4, 17, 0, 0, digest::MinimizedHashType::CANON); + append_seq_compare(str1, str5, *dig, 4); + delete dig; +} + +void append_seq_small_cases2() { + std::string str1_good = "CATACCGGT"; + std::string str1_short = "TAG"; + std::string str1_badCh = "CATACNCGGT"; + + std::string str2_good = "GTTCTCGCTT"; + std::string str2_badCh = "GTNTCTCGCTT"; + std::string str2A = "A"; + std::string str2_short = "TGGA"; + + std::string str3_good = "CAACGACCGC"; + std::string str3_badCh = "NCAACGACCGC"; + + digest::Digester *dig = + new digest::ModMin( + str1_good, 6, 17, 0, 0, digest::MinimizedHashType::CANON); + append_seq_compare3(str1_good, str2_good, str3_good, *dig, 6); + delete dig; + + dig = new digest::ModMin( + str1_good, 6, 17, 0, 0, digest::MinimizedHashType::CANON); + append_seq_compare3(str1_good, str2_badCh, str3_good, *dig, 6); + delete dig; + + dig = new digest::ModMin( + str1_good, 6, 17, 0, 0, digest::MinimizedHashType::CANON); + append_seq_compare3(str1_good, str2A, str3_good, *dig, 6); + delete dig; + + dig = new digest::ModMin( + str1_short, 6, 17, 0, 0, digest::MinimizedHashType::CANON); + append_seq_compare3(str1_short, str2A, str3_good, *dig, 6); + delete dig; + + dig = new digest::ModMin( + str1_badCh, 6, 17, 0, 0, digest::MinimizedHashType::CANON); + append_seq_compare3(str1_badCh, str2A, str3_good, *dig, 6); + delete dig; + + dig = new digest::ModMin( + str1_good, 6, 17, 0, 0, digest::MinimizedHashType::CANON); + append_seq_compare3(str1_good, str2_short, str3_good, *dig, 6); + delete dig; + + dig = new digest::ModMin( + str1_short, 6, 17, 0, 0, digest::MinimizedHashType::CANON); + append_seq_compare3(str1_short, str2A, str3_badCh, *dig, 6); + delete dig; +} + +template +void append_seq_compare_write_over(std::string &str1, std::string &str2, + digest::Digester

&dig, unsigned k) { + INFO(str1); + INFO(str2); + INFO(str1.size()); + INFO(str2.size()); + INFO(k); + + std::string str3 = str1 + str2; + for (int i = 0; i < (int)str3.size(); i++) { + if (str3[i] == 'N' || str3[i] == 'n') { + str3[i] = 'A'; + } + } + nthash::NtHash tHash(str3, 1, k); + std::vector vec1; + std::vector positions1; + while (tHash.roll()) { + vec1.push_back(*(tHash.hashes())); + positions1.push_back(tHash.get_pos()); + } + std::vector vec2; + std::vector positions2; + if (dig.get_is_valid_hash()) { + vec2.push_back(dig.get_chash()); + positions2.push_back(dig.get_pos()); + while (dig.roll_one()) { + vec2.push_back(dig.get_chash()); + positions2.push_back(dig.get_pos()); + } + } + dig.append_seq(str2); + if (dig.get_is_valid_hash()) { + vec2.push_back(dig.get_chash()); + positions2.push_back(dig.get_pos()); + while (dig.roll_one()) { + vec2.push_back(dig.get_chash()); + positions2.push_back(dig.get_pos()); + } + } + REQUIRE(vec1.size() == vec2.size()); + for (size_t i = 0; i < vec1.size(); i++) { + INFO(i); + CHECK(vec1[i] == vec2[i]); + CHECK(positions1[i] == positions2[i]); + } +} + +template +void append_seq_compare3_write_over(std::string &str1, std::string &str2, + std::string str3, digest::Digester

&dig, + unsigned k) { + INFO(str1); + INFO(str2); + INFO(str3); + INFO(k); + // Make sure to check positions too + std::string str4 = str1 + str2 + str3; + for (int i = 0; i < (int)str4.size(); i++) { + if (str4[i] == 'N' || str4[i] == 'n') { + str4[i] = 'A'; + } + } + nthash::NtHash tHash(str4, 1, k); + std::vector vec1; + std::vector positions1; + while (tHash.roll()) { + vec1.push_back(*(tHash.hashes())); + positions1.push_back(tHash.get_pos()); + } + std::vector vec2; + std::vector positions2; + if (dig.get_is_valid_hash()) { + vec2.push_back(dig.get_chash()); + positions2.push_back(dig.get_pos()); + while (dig.roll_one()) { + vec2.push_back(dig.get_chash()); + positions2.push_back(dig.get_pos()); + } + } + dig.append_seq(str2); + if (dig.get_is_valid_hash()) { + vec2.push_back(dig.get_chash()); + positions2.push_back(dig.get_pos()); + while (dig.roll_one()) { + vec2.push_back(dig.get_chash()); + positions2.push_back(dig.get_pos()); + } + } + dig.append_seq(str3); + if (dig.get_is_valid_hash()) { + vec2.push_back(dig.get_chash()); + positions2.push_back(dig.get_pos()); + while (dig.roll_one()) { + vec2.push_back(dig.get_chash()); + positions2.push_back(dig.get_pos()); + } + } + REQUIRE(vec1.size() == vec2.size()); + for (size_t i = 0; i < vec1.size(); i++) { + INFO(i); + CHECK(vec1[i] == vec2[i]); + CHECK(positions1[i] == positions2[i]); + } +} + +void append_seq_small_cases_write_over() { + std::string str1 = "CCGTGT"; + std::string str2 = "CCGNGT"; + std::string str3 = "AGCCTT"; + std::string str4 = "ANCCTT"; + std::string str5 = "A"; + + digest::Digester *dig = + new digest::ModMin( + str1, 4, 17, 0, 0, digest::MinimizedHashType::CANON); + append_seq_compare_write_over(str1, str3, *dig, 4); + delete dig; + + dig = new digest::ModMin( + str2, 4, 17, 0, 0, digest::MinimizedHashType::CANON); + append_seq_compare_write_over(str2, str4, *dig, 4); + delete dig; + + dig = new digest::ModMin( + str2, 4, 17, 0, 0, digest::MinimizedHashType::CANON); + 
append_seq_compare_write_over(str2, str3, *dig, 4); + delete dig; + + dig = new digest::ModMin( + str2, 4, 17, 0, 0, digest::MinimizedHashType::CANON); + append_seq_compare_write_over(str2, str5, *dig, 4); + delete dig; + + dig = new digest::ModMin( + str1, 4, 17, 0, 0, digest::MinimizedHashType::CANON); + append_seq_compare_write_over(str1, str5, *dig, 4); + delete dig; +} + +void append_seq_small_cases2_write_over() { + std::string str1_good = "CATACCGGT"; + std::string str1_short = "TAG"; + std::string str1_badCh = "CATACNCGGT"; + + std::string str2_good = "GTTCTCGCTT"; + std::string str2_badCh = "GTNTCTCGCTT"; + std::string str2A = "A"; + std::string str2_short = "TGGA"; + + std::string str3_good = "CAACGACCGC"; + std::string str3_badCh = "NCAACGACCGC"; + + digest::Digester *dig = + new digest::ModMin( + str1_good, 6, 17, 0, 0, digest::MinimizedHashType::CANON); + append_seq_compare3_write_over(str1_good, str2_good, str3_good, *dig, 6); + delete dig; + + dig = new digest::ModMin( + str1_good, 6, 17, 0, 0, digest::MinimizedHashType::CANON); + append_seq_compare3_write_over(str1_good, str2_badCh, str3_good, *dig, 6); + delete dig; + + dig = new digest::ModMin( + str1_good, 6, 17, 0, 0, digest::MinimizedHashType::CANON); + append_seq_compare3_write_over(str1_good, str2A, str3_good, *dig, 6); + delete dig; + + dig = new digest::ModMin( + str1_short, 6, 17, 0, 0, digest::MinimizedHashType::CANON); + append_seq_compare3_write_over(str1_short, str2A, str3_good, *dig, 6); + delete dig; + + dig = new digest::ModMin( + str1_badCh, 6, 17, 0, 0, digest::MinimizedHashType::CANON); + append_seq_compare3_write_over(str1_badCh, str2A, str3_good, *dig, 6); + delete dig; + + dig = new digest::ModMin( + str1_good, 6, 17, 0, 0, digest::MinimizedHashType::CANON); + append_seq_compare3_write_over(str1_good, str2_short, str3_good, *dig, 6); + delete dig; + + dig = new digest::ModMin( + str1_short, 6, 17, 0, 0, digest::MinimizedHashType::CANON); + 
append_seq_compare3_write_over(str1_short, str2A, str3_badCh, *dig, 6); + delete dig; +} +/* + consider re-organizing this so this only tests the UM_Digester specific + stuff like the constructor and roll_minimizer, but put the more general + stuff, append_seq and roll_one in the general testing group +*/ + +TEST_CASE("Digester Testing") { + setupStrings(); + // These use the ModMinimizer Class because Digester can't be instantiated, + // but correctness doesn't depend on any of the fields of functions in the + // ModMinimizer Class + SECTION("Base Constructor Special Cases") { + unsigned k; + digest::MinimizedHashType minimized_h; + uint32_t mod, congruence; + size_t pos; + std::string str; + // string is length 1, k = 1 + str = "AAAA"; + k = 4; + pos = 0; + for (int i = 0; i < 3; i++) { + minimized_h = static_cast(i); + mod = 2; + congruence = 1; + + digest::ModMin *dig = + new digest::ModMin( + str, k, mod, congruence, pos, minimized_h); + ModMin_constructor(*dig, str, k, pos, minimized_h, mod, congruence); + delete dig; + } + + // string is length 1, k = 4 + str = "A"; + k = ks[1]; + pos = 0; + for (int i = 0; i < 3; i++) { + minimized_h = static_cast(i); + mod = 2; + congruence = 1; + + digest::ModMin *dig = + new digest::ModMin( + str, k, mod, congruence, pos, minimized_h); + ModMin_constructor(*dig, str, k, pos, minimized_h, mod, congruence); + delete dig; + } + + for (uint i = 0; i < test_strs.size(); i++) { + for (int j = 0; j < 8; j++) { + k = ks[j]; + for (int l = 0; l < 16; l++) { + pos = l; + for (int p = 0; p < 3; p++) { + minimized_h = static_cast(p); + mod = 1e9 + 7; + congruence = 0; + + digest::ModMin *dig = + new digest::ModMin( + test_strs[i], k, mod, congruence, pos, + minimized_h); + ModMin_constructor(*dig, test_strs[i], k, pos, + minimized_h, mod, congruence); + delete dig; + } + } + } + } + + // test writeover policy + for (int j = 0; j < 8; j++) { + k = ks[j]; + for (int l = 0; l < 16; l++) { + pos = l; + for (int p = 0; p < 3; p++) { + 
minimized_h = static_cast(p); + mod = 1e9 + 7; + congruence = 0; + + digest::ModMin *dig = + new digest::ModMin( + test_strs[4], k, mod, congruence, pos, minimized_h); + ModMin_constructor_writeover(*dig, test_strs[4], k, pos, + minimized_h, mod, congruence); + delete dig; + } + } + } + + // Throwing Exceptions + // Shouldn't/Doesn't leak any memory + // https://stackoverflow.com/questions/147572/will-the-below-code-cause-memory-leak-in-c + + str = "ACTGACTG"; + k = 4; + pos = 0; + minimized_h = digest::MinimizedHashType::CANON; + mod = 1e9 + 7; + congruence = 0; + + k = 0; + digest::ModMin *dig; + CHECK_THROWS_AS(dig = + new digest::ModMin( + str, k, mod, congruence, pos, minimized_h), + digest::BadConstructionException); + k = 4; + + // pos >= seq.size() + pos = 8; + CHECK_THROWS_AS(dig = + new digest::ModMin( + str, k, mod, congruence, pos, minimized_h), + digest::BadConstructionException); + pos = 0; + + // minimized_h > 2 + minimized_h = (digest::MinimizedHashType)3; + CHECK_THROWS_AS(dig = + new digest::ModMin( + str, k, mod, congruence, pos, minimized_h), + digest::BadConstructionException); + minimized_h = (digest::MinimizedHashType)0; + } + + SECTION("Testing roll_one") { + for (int i = 0; i < 7; i++) { + for (int j = 0; j < 8; j++) { + digest::ModMin *dig = + new digest::ModMin( + test_strs[i], ks[j], 1e9 + 7, 0, 0, + digest::MinimizedHashType::FORWARD); + roll_one(*dig, test_strs[i], ks[j]); + delete dig; + } + } + + // testing roll_one for writeover + for (int j = 0; j < 8; j++) { + digest::ModMin *dig = + new digest::ModMin( + test_strs[4], ks[j], 1e9 + 7, 0, 0, + digest::MinimizedHashType::FORWARD); + roll_one_write_over(*dig, test_strs[4], ks[j]); + delete dig; + } + } + + SECTION("Testing append_seq()") { + append_seq_small_cases(); + // Throws NotRolledTillEndException() + digest::ModMin *dig = + new digest::ModMin(test_strs[0], 4, + 17); + CHECK_THROWS_AS(dig->append_seq(test_strs[0]), + digest::NotRolledTillEndException); + delete dig; + + for 
(int i = 0; i < 7; i += 2) { + for (int j = 0; j < 8; j++) { + for (int l = 15; l < 91; l += 15) { + std::string str1 = test_strs[i].substr(0, l); + std::string str2 = test_strs[i].substr(l, 100); + digest::ModMin *dig = + new digest::ModMin( + str1, ks[j], 1e9 + 7, 0, 0, + digest::MinimizedHashType::FORWARD); + append_seq_compare(str1, str2, *dig, ks[j]); + delete dig; + } + } + } + append_seq_small_cases2(); + for (int i = 0; i < 7; i += 2) { + for (int j = 0; j < 8; j++) { + for (int l = 15; l < 91; l += 15) { + for (int r = 12; r < 85; r += 24) { + std::string str1 = test_strs[i].substr(0, l); + std::string str2 = test_strs[i].substr(l, r); + std::string str3 = test_strs[i].substr(l + r, 75); + digest::ModMin *dig = + new digest::ModMin( + str1, ks[j], 1e9 + 7, 0, 0, + digest::MinimizedHashType::FORWARD); + append_seq_compare3(str1, str2, str3, *dig, ks[j]); + delete dig; + } + } + } + } + + // testing append_seq for writeover + append_seq_small_cases_write_over(); + for (int j = 0; j < 8; j++) { + for (int l = 15; l < 91; l += 15) { + std::string str1 = test_strs[4].substr(0, l); + std::string str2 = test_strs[4].substr(l, 100); + digest::ModMin *dig = + new digest::ModMin( + str1, ks[j], 1e9 + 7, 0, 0, + digest::MinimizedHashType::FORWARD); + append_seq_compare_write_over(str1, str2, *dig, ks[j]); + delete dig; + } + } + + append_seq_small_cases2_write_over(); + for (int j = 0; j < 8; j++) { + for (int l = 15; l < 91; l += 15) { + for (int r = 12; r < 85; r += 24) { + std::string str1 = test_strs[4].substr(0, l); + std::string str2 = test_strs[4].substr(l, r); + std::string str3 = test_strs[4].substr(l + r, 75); + digest::ModMin *dig = + new digest::ModMin( + str1, ks[j], 1e9 + 7, 0, 0, + digest::MinimizedHashType::FORWARD); + append_seq_compare3_write_over(str1, str2, str3, *dig, + ks[j]); + delete dig; + } + } + } + } + + SECTION("Testing new_seq()") { + unsigned k; + std::string str; + // string is length 1, k = 4 + str = "A"; + k = ks[1]; + + 
digest::ModMin *dig1 = + new digest::ModMin( + test_strs[0], k, 1e9 + 7, 0, 0, + digest::MinimizedHashType::CANON); + dig1->new_seq(str, 0); + base_constructor(*dig1, str, k, 0, digest::MinimizedHashType::CANON); + delete dig1; + + // Throw BadConstructionException() + dig1 = new digest::ModMin( + test_strs[0], k, 1e9 + 7, 0, 0, digest::MinimizedHashType::CANON); + CHECK_THROWS_AS(dig1->new_seq(test_strs[0], 500), + digest::BadConstructionException); + delete dig1; + + for (uint i = 0; i < test_strs.size(); i += 2) { + for (int j = 0; j < 32; j += 8) { + digest::ModMin *dig = + new digest::ModMin( + test_strs[1], ks[3], 1e9 + 7, 0, 0, + digest::MinimizedHashType::CANON); + dig->new_seq(test_strs[i], j); + base_constructor(*dig, test_strs[i], ks[3], j, + digest::MinimizedHashType::CANON); + delete dig; + } + } + + for (uint i = 0; i < test_strs.size(); i += 2) { + for (int l = 13; l <= 78; l += 13) { + digest::ModMin *dig = + new digest::ModMin( + test_strs[5], ks[3], 1e9 + 7, 0, 0, + digest::MinimizedHashType::CANON); + int ind = 0; + while (ind < l && dig->roll_one()) { + ind++; + } + dig->new_seq(test_strs[i], 0); + base_constructor(*dig, test_strs[i], ks[3], 0, + digest::MinimizedHashType::CANON); + delete dig; + } + } + + // new_seq when deque has stuff in it + dig1 = new digest::ModMin( + test_strs[2], 8, 17, 0, 0, digest::MinimizedHashType::CANON); + std::vector vec; + dig1->roll_minimizer(1000, vec); + vec.clear(); + dig1->append_seq(test_strs[2]); + dig1->roll_minimizer(1000, vec); + vec.clear(); + dig1->new_seq(test_strs[4], 0); + base_constructor(*dig1, test_strs[4], 8, 0, + digest::MinimizedHashType::CANON); + delete dig1; + + // new_seq when deque has stuff in it and a new hash can't be properly + // initialized + std::string bad_str = "TTACTNGTACCTG"; + dig1 = new digest::ModMin( + test_strs[2], 8, 17, 0, 0, digest::MinimizedHashType::CANON); + dig1->roll_minimizer(1000, vec); + vec.clear(); + dig1->append_seq(test_strs[2]); + 
dig1->roll_minimizer(1000, vec); + vec.clear(); + dig1->new_seq(bad_str, 0); + base_constructor(*dig1, bad_str, 8, 0, + digest::MinimizedHashType::CANON); + delete dig1; + } +} + +TEST_CASE("ModMin Testing") { + setupStrings(); + + SECTION("Testing Constructors") { + unsigned k; + digest::MinimizedHashType minimized_h; + uint32_t mod, congruence; + size_t pos; + std::string str; + + // Throwing Exceptions + // Shouldn't/Doesn't leak any memory + // https://stackoverflow.com/questions/147572/will-the-below-code-cause-memory-leak-in-c + + str = "ACTGACTG"; + k = 4; + pos = 0; + minimized_h = digest::MinimizedHashType::CANON; + digest::ModMin *dig; + + // mod >= congruence + mod = 2; + congruence = 2; + CHECK_THROWS_AS(dig = + new digest::ModMin( + str, k, mod, congruence, pos, minimized_h), + digest::BadModException); + } + + // maybe move this into an entirely new test case, and make this big thing + // just tests for the Dig class + SECTION("Testing roll_minimizer(). The one that takes no parameters") { + uint32_t prime = 17; + for (int i = 0; i < 7; i += 2) { + for (int j = 0; j < 8; j++) { + for (int l = 0; l < 3; l++) { + digest::ModMin *dig = + new digest::ModMin( + test_strs[i], ks[j], prime, 0, 0, + static_cast(l)); + ModMin_roll_minimizer( + *dig, test_strs[i], ks[j], + static_cast(l), prime); + delete dig; + } + } + } + } + + SECTION("Testing Copy Constructor") { + for (int i = 0; i < 7; i += 2) { + for (int j = 0; j < 8; j++) { + for (int l = 15; l < 91; l += 15) { + digest::ModMin *dig1 = + new digest::ModMin( + test_strs[i], ks[j], 1e9 + 7, 0, l, + digest::MinimizedHashType::FORWARD); + digest::ModMin *dig2 = + new digest::ModMin( + *dig1); + ModMin_dig_comp(*dig1, *dig2); + delete dig1; + delete dig2; + } + } + } + + for (int i = 0; i < 7; i += 2) { + for (int j = 0; j < 8; j++) { + for (int l = 15; l < 91; l += 15) { + std::string str1 = test_strs[i].substr(0, l); + std::string str2 = test_strs[i].substr(l, 100); + digest::ModMin *dig1 = + new 
digest::ModMin( + str1, ks[j], 1e9 + 7, 0, 0, + digest::MinimizedHashType::FORWARD); + std::vector vec; + dig1->roll_minimizer(1000, vec); + dig1->append_seq(str2); + digest::ModMin *dig2 = + new digest::ModMin( + *dig1); + ModMin_dig_comp(*dig1, *dig2); + delete dig1; + delete dig2; + } + } + } + } + + SECTION("Testing Assignment Operator") { + for (int i = 0; i < 7; i += 2) { + for (int j = 0; j < 8; j++) { + for (int l = 15; l < 91; l += 15) { + digest::ModMin *dig1 = + new digest::ModMin( + test_strs[i], ks[j], 1e9 + 7, 0, l, + digest::MinimizedHashType::FORWARD); + digest::ModMin *dig2 = + new digest::ModMin( + test_strs[1], 99, 98765, 3, 0, + digest::MinimizedHashType::REVERSE); + *dig2 = *dig1; + ModMin_dig_comp(*dig1, *dig2); + delete dig1; + delete dig2; + } + } + } + + for (int i = 0; i < 7; i += 2) { + for (int j = 0; j < 8; j++) { + for (int l = 15; l < 91; l += 15) { + std::string str1 = test_strs[i].substr(0, l); + std::string str2 = test_strs[i].substr(l, 100); + digest::ModMin *dig1 = + new digest::ModMin( + str1, ks[j], 1e9 + 7, 0, 0, + digest::MinimizedHashType::FORWARD); + std::vector vec; + dig1->roll_minimizer(1000, vec); + dig1->append_seq(str2); + digest::ModMin *dig2 = + new digest::ModMin( + test_strs[1], 99, 98765, 3, 0, + digest::MinimizedHashType::REVERSE); + *dig2 = *dig1; + ModMin_dig_comp(*dig1, *dig2); + delete dig1; + delete dig2; + } + } + } + } +} + +#define do64(F) \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::SegmentTree<4>, 4) } \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::SegmentTree<31>, 31) } \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::SegmentTree<32>, 32) } \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::SegmentTree<33>, 33) } \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::SegmentTree<63>, 63) } \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::SegmentTree<64>, 64) } \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive<4>, 4) } \ + { F(digest::BadCharPolicy::SKIPOVER, 
digest::ds::Naive<31>, 31) } \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive<32>, 32) } \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive<33>, 33) } \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive<63>, 63) } \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive<64>, 64) } \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive2<4>, 4) } \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive2<31>, 31) } \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive2<32>, 32) } \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive2<33>, 33) } \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive2<63>, 63) } \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Naive2<64>, 64) } \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Adaptive, 4) } \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Adaptive, 31) } \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Adaptive, 32) } \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Adaptive, 33) } \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Adaptive, 63) } \ + { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Adaptive, 64) } + +TEST_CASE("WindowMin Testing") { + SECTION("Constructor Testing") { + unsigned k; + digest::MinimizedHashType minimized_h; + size_t pos; + std::string str; + + // Throwing Exceptions + // Shouldn't/Doesn't leak any memory + // https://stackoverflow.com/questions/147572/will-the-below-code-cause-memory-leak-in-c + + str = "ACTGACTG"; + k = 4; + pos = 0; + minimized_h = digest::MinimizedHashType::CANON; + /* + #define WC(T) \ + digest::WindowMin* dig1; \ + CHECK_THROWS_AS((dig1 = new digest::WindowMin(str, k, 0, pos, + minimized_h)), digest::BadWindowSizeException); + + WC(data_structure::SegmentTree<0>); + */ + + for (uint i = 0; i < test_strs.size(); i++) { + k = 4; + pos = 0; + minimized_h = digest::MinimizedHashType::CANON; + +#define TEST_CONSTRUCTOR_0(P, T, j) \ + digest::WindowMin *dig = \ + new digest::WindowMin(test_strs[i], k, j, 
pos, minimized_h); \ + WindowMin_constructor(*dig, test_strs[i], k, j, pos, minimized_h); \ + delete dig; + + do64(TEST_CONSTRUCTOR_0) + } + } + + SECTION("roll_minimizer() testing") { + for (int i = 0; i < 7; i += 2) { + // std::cout << test_strs[i] << std::endl; + for (int j = 0; j < 8; j++) { + for (int l = 0; l < 3; l++) { +#define TEST_ROLL_0(p, m, k) \ + digest::WindowMin *dig = new digest::WindowMin( \ + test_strs[i], ks[j], k, 0, static_cast(l)); \ + WindowMin_roll_minimizer(*dig, test_strs[i], ks[j], k, \ + static_cast(l)); \ + delete dig; + + do64(TEST_ROLL_0) + } + } + } + } + /* + the below also inadverntently tests how append_seq (only the case + that there are 2 sequences involved total) works with roll_minimizer for + WindowMin. In theory this shouldn't be needed and also can't be + considered "thorough", but it is extra assurance. + */ + SECTION("Testing Copy Constructor") { + for (int i = 0; i < 7; i += 2) { + for (int j = 0; j < 8; j++) { + for (int l = 15; l < 91; l += 15) { +#define TEST_COPY_0(P, T, k) \ + digest::WindowMin dig1(test_strs[i], ks[j], k, l, \ + digest::MinimizedHashType::FORWARD); \ + digest::WindowMin dig2(dig1); \ + WindowMin_dig_comp(dig1, dig2); + + do64(TEST_COPY_0) + } + } + } + + for (int i = 0; i < 7; i += 2) { + for (int j = 0; j < 8; j++) { + for (int l = 15; l < 91; l += 15) { +#define TEST_COPY_1(P, T, k) \ + std::string str1 = test_strs[i].substr(0, l); \ + std::string str2 = test_strs[i].substr(l, 100); \ + digest::WindowMin dig1(str1, ks[j], k, 0, \ + digest::MinimizedHashType::FORWARD); \ + std::vector vec; \ + dig1.roll_minimizer(1000, vec); \ + dig1.append_seq(str2); \ + digest::WindowMin dig2(dig1); \ + WindowMin_dig_comp(dig1, dig2); + + do64(TEST_COPY_1) + } + } + } + } + + SECTION("Testing Assignment Operator") { + for (int i = 0; i < 7; i += 2) { + for (int j = 0; j < 8; j++) { + for (int l = 15; l < 91; l += 15) { +#define TEST_COPY_2(P, T, k) \ + digest::WindowMin dig1(test_strs[i], ks[j], k, l, \ + 
digest::MinimizedHashType::FORWARD); \ + digest::WindowMin dig2(test_strs[1], 99, k, 0, \ + digest::MinimizedHashType::REVERSE); \ + dig2 = dig1; \ + WindowMin_dig_comp(dig1, dig2); + + do64(TEST_COPY_2) + } + } + } + + for (int i = 0; i < 7; i += 2) { + for (int j = 0; j < 8; j++) { + for (int l = 15; l < 91; l += 15) { +#define TEST_COPY_3(P, T, m) \ + std::string str1 = test_strs[i].substr(0, l); \ + std::string str2 = test_strs[i].substr(l, 100); \ + digest::WindowMin dig1(str1, ks[j], m, 0, \ + digest::MinimizedHashType::FORWARD); \ + std::vector vec; \ + dig1.roll_minimizer(1000, vec); \ + dig1.append_seq(str2); \ + digest::WindowMin dig2(test_strs[1], 35, m, 0, \ + digest::MinimizedHashType::REVERSE); \ + dig2 = dig1; \ + WindowMin_dig_comp(dig1, dig2); + + do64(TEST_COPY_3) + } + } + } + } +} + +TEST_CASE("Syncmer Testing") { + // Syncmer and WindowMinimizers have all the same class members so I can + // just use the WindowMin tests for Constructor and be ok + SECTION("Constructor Testing") { + + unsigned k; + digest::MinimizedHashType minimized_h; + size_t pos; + std::string str; + + // Throwing Exceptions + // Shouldn't/Doesn't leak any memory + // https://stackoverflow.com/questions/147572/will-the-below-code-cause-memory-leak-in-c + + str = "ACTGACTG"; + k = 4; + pos = 0; + minimized_h = digest::MinimizedHashType::CANON; + // digest::Syncmer>* dig1; + // CHECK_THROWS_AS((dig1 = new + // digest::Syncmer>(str, + // k, 0, pos, minimized_h)), digest::BadWindowSizeException); + + for (uint i = 0; i < test_strs.size(); i++) { +#define TEST_SYNCON(P, T, j) \ + k = 4; \ + pos = 0; \ + minimized_h = digest::MinimizedHashType::CANON; \ + \ + digest::Syncmer dig(test_strs[i], k, j, pos, minimized_h); \ + WindowMin_constructor(dig, test_strs[i], k, j, pos, minimized_h); + + do64(TEST_SYNCON) + } + } + + SECTION("roll_minimizer() testing") { + for (int i = 0; i < 7; i += 2) { + // std::cout << test_strs[i] << std::endl; + for (int j = 0; j < 8; j++) { + for (int l = 
0; l < 3; l++) { +#define TEST_SYNCROLL(P, T, m) \ + digest::Syncmer dig(test_strs[i], ks[j], m, 0, \ + static_cast(l)); \ + Syncmer_roll_minimizer(dig, test_strs[i], ks[j], m, \ + static_cast(l)); + + do64(TEST_SYNCROLL) + } + } + } + } + + /* + the below also inadverntently tests how append_seq (only the case + that there are 2 sequences involved total) works with roll_minimizer for + WindowMin. In theory this shouldn't be needed and also can't be + considered "thorough", but it is extra assurance. + */ + SECTION("Testing Copy Constructor") { + for (int i = 0; i < 7; i += 2) { + for (int j = 0; j < 8; j++) { + for (int l = 15; l < 91; l += 15) { +#define TEST_SYNCOPY_0(P, T, m) \ + digest::Syncmer dig1(test_strs[i], ks[j], m, l, \ + digest::MinimizedHashType::FORWARD); \ + digest::Syncmer dig2(dig1); \ + Syncmer_dig_comp(dig1, dig2); + + do64(TEST_SYNCOPY_0) + } + } + } + + for (int i = 0; i < 7; i += 2) { + for (int j = 0; j < 8; j++) { + for (int l = 15; l < 91; l += 15) { +#define TEST_SYNCOPY_1(P, T, m) \ + std::string str1 = test_strs[i].substr(0, l); \ + std::string str2 = test_strs[i].substr(l, 100); \ + digest::Syncmer dig1(str1, ks[j], m, 0, \ + digest::MinimizedHashType::FORWARD); \ + std::vector vec; \ + dig1.roll_minimizer(1000, vec); \ + dig1.append_seq(str2); \ + digest::Syncmer dig2(dig1); \ + Syncmer_dig_comp(dig1, dig2); + + do64(TEST_SYNCOPY_1) + } + } + } + } + + // had to modify !!! 
+ SECTION("Testing Assignment Operator") { + for (int i = 0; i < 7; i += 2) { + for (int j = 0; j < 8; j++) { + for (int l = 15; l < 91; l += 15) { +#define TEST_ASSIGNMENT_0(P, T, m) \ + digest::Syncmer *dig1 = new digest::Syncmer( \ + test_strs[i], ks[j], m, l, digest::MinimizedHashType::FORWARD); \ + digest::Syncmer *dig2 = new digest::Syncmer( \ + test_strs[1], 99, m, 0, digest::MinimizedHashType::REVERSE); \ + *dig2 = *dig1; \ + Syncmer_dig_comp(*dig1, *dig2); + + do64(TEST_ASSIGNMENT_0) + } + } + } + + for (int i = 0; i < 7; i += 2) { + for (int j = 0; j < 8; j++) { + for (int l = 15; l < 91; l += 15) { +#define TEST_ASSIGNMENT_1(P, T, m) \ + std::string str1 = test_strs[i].substr(0, l); \ + std::string str2 = test_strs[i].substr(l, 100); \ + digest::Syncmer *dig1 = new digest::Syncmer( \ + str1, ks[j], m, 0, digest::MinimizedHashType::FORWARD); \ + std::vector vec; \ + dig1->roll_minimizer(1000, vec); \ + dig1->append_seq(str2); \ + digest::Syncmer *dig2 = new digest::Syncmer( \ + test_strs[1], 35, m, 0, digest::MinimizedHashType::REVERSE); \ + *dig2 = *dig1; \ + Syncmer_dig_comp(*dig1, *dig2); + + do64(TEST_ASSIGNMENT_1) + } + } + } + } +} + +// #include +// +// template +// class MyClass { +// public: +// void display() { +// std::cout << "Value of k: " << k << std::endl; +// } +// }; +// +// // Template specialization to handle the base case +// template <> +// class MyClass<0> { +// public: +// void display() { +// // Do nothing or handle the base case as needed +// } +// }; +// +// // Recursive template to instantiate objects with values from 1 to 32 +// template +// struct InstantiateObjects { +// static void instantiate() { +// MyClass obj; +// obj.display(); +// InstantiateObjects::instantiate(); +// } +// }; +// +// // Template specialization to handle the base case of recursion +// template <> +// struct InstantiateObjects<0> { +// static void instantiate() { +// // Do nothing or handle the base case as needed +// } +// }; +// +// int main() { +// 
// Instantiate objects with values from 1 to 32 +// InstantiateObjects<32>::instantiate(); +// +// return 0; +// } +// diff --git a/tests/test/test_thread.cpp b/tests/test/test_thread.cpp index 01aaeb8..db5a3d8 100644 --- a/tests/test/test_thread.cpp +++ b/tests/test/test_thread.cpp @@ -8,348 +8,360 @@ std::vector test_strs; void setupStrings() { - std::string files[] = { - "../tests/test/A.txt", - "../tests/test/a_lowercase.txt", - "../tests/test/salmonella_enterica.txt", - "../tests/test/salmonella_lowercase.txt", - "../tests/test/random.txt", - "../tests/test/random_lowercase.txt", - "../tests/test/N.txt", - }; + std::string files[] = { + "../tests/test/A.txt", + "../tests/test/a_lowercase.txt", + "../tests/test/salmonella_enterica.txt", + "../tests/test/salmonella_lowercase.txt", + "../tests/test/random.txt", + "../tests/test/random_lowercase.txt", + "../tests/test/N.txt", + }; - for (auto &file : files) { - std::ifstream ifs(file); - ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); - std::string str; - ifs >> str; - test_strs.push_back(str); - } + for (auto &file : files) { + std::ifstream ifs(file); + ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); + std::string str; + ifs >> str; + test_strs.push_back(str); + } } template std::vector multi_to_single_vec(std::vector> vec) { - std::vector ret_vec; - for (size_t i = 0; i < vec.size(); i++) { - for (auto a : vec[i]) { - ret_vec.push_back(a); - } - } - return ret_vec; + std::vector ret_vec; + for (size_t i = 0; i < vec.size(); i++) { + for (auto a : vec[i]) { + ret_vec.push_back(a); + } + } + return ret_vec; } void test_thread_mod(unsigned thread_count, std::string str, unsigned k, - uint64_t mod, uint64_t congruence, size_t start, - digest::MinimizedHashType minimized_h) { - std::vector single_thread; - std::vector> vec; - digest::ModMin dig(str, k, mod, congruence, - start, minimized_h); - dig.roll_minimizer(str.size(), single_thread); - thread_out::thread_mod( - thread_count, 
vec, str, k, mod, congruence, start, minimized_h); - std::vector multi_thread = multi_to_single_vec(vec); + uint64_t mod, uint64_t congruence, size_t start, + digest::MinimizedHashType minimized_h) { + std::vector single_thread; + std::vector> vec; + digest::ModMin dig(str, k, mod, congruence, + start, minimized_h); + dig.roll_minimizer(str.size(), single_thread); + digest::thread_out::thread_mod( + thread_count, vec, str, k, mod, congruence, start, minimized_h); + std::vector multi_thread = multi_to_single_vec(vec); - REQUIRE(single_thread.size() == multi_thread.size()); - for (size_t i = 0; i < single_thread.size(); i++) { - CHECK(single_thread[i] == multi_thread[i]); - } + REQUIRE(single_thread.size() == multi_thread.size()); + for (size_t i = 0; i < single_thread.size(); i++) { + CHECK(single_thread[i] == multi_thread[i]); + } } void test_thread_wind(unsigned thread_count, std::string str, unsigned k, - unsigned large_wind_kmer_am, size_t start, - digest::MinimizedHashType minimized_h) { - std::vector single_thread; - std::vector> vec; - digest::WindowMin dig( - str, k, large_wind_kmer_am, start, minimized_h); - dig.roll_minimizer(str.size(), single_thread); - thread_out::thread_wind( - thread_count, vec, str, k, large_wind_kmer_am, start, minimized_h); - std::vector multi_thread = multi_to_single_vec(vec); + unsigned large_wind_kmer_am, size_t start, + digest::MinimizedHashType minimized_h) { + std::vector single_thread; + std::vector> vec; + digest::WindowMin + dig(str, k, large_wind_kmer_am, start, minimized_h); + dig.roll_minimizer(str.size(), single_thread); + digest::thread_out::thread_wind( + thread_count, vec, str, k, large_wind_kmer_am, start, minimized_h); + std::vector multi_thread = multi_to_single_vec(vec); - REQUIRE(single_thread.size() == multi_thread.size()); - for (size_t i = 0; i < single_thread.size(); i++) { - CHECK(single_thread[i] == multi_thread[i]); - } + REQUIRE(single_thread.size() == multi_thread.size()); + for (size_t i = 0; i < 
single_thread.size(); i++) { + CHECK(single_thread[i] == multi_thread[i]); + } } void test_thread_sync(unsigned thread_count, std::string str, unsigned k, - unsigned large_wind_kmer_am, size_t start, - digest::MinimizedHashType minimized_h) { - std::vector single_thread; - std::vector> vec; - digest::Syncmer dig( - str, k, large_wind_kmer_am, start, minimized_h); - dig.roll_minimizer(str.size(), single_thread); - thread_out::thread_sync( - thread_count, vec, str, k, large_wind_kmer_am, start, minimized_h); - std::vector multi_thread = multi_to_single_vec(vec); + unsigned large_wind_kmer_am, size_t start, + digest::MinimizedHashType minimized_h) { + std::vector single_thread; + std::vector> vec; + digest::Syncmer dig( + str, k, large_wind_kmer_am, start, minimized_h); + dig.roll_minimizer(str.size(), single_thread); + digest::thread_out::thread_sync( + thread_count, vec, str, k, large_wind_kmer_am, start, minimized_h); + std::vector multi_thread = multi_to_single_vec(vec); - REQUIRE(single_thread.size() == multi_thread.size()); - for (size_t i = 0; i < single_thread.size(); i++) { - CHECK(single_thread[i] == multi_thread[i]); - } + REQUIRE(single_thread.size() == multi_thread.size()); + for (size_t i = 0; i < single_thread.size(); i++) { + CHECK(single_thread[i] == multi_thread[i]); + } } TEST_CASE("thread_mod function testing") { - setupStrings(); - SECTION("Throw Errors") { - std::string str = "ACTGACTG"; - unsigned thread_count = 4; - std::vector> vec; - unsigned k = 4; - uint64_t mod = 1e9 + 7; - uint64_t congruence = 0; - size_t start = 0; - digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON; - // k < 4 - k = 3; - CHECK_THROWS_AS( - thread_out::thread_mod( - thread_count, vec, str, k, mod, congruence, start, minimized_h), - thread_out::BadThreadOutParams); - k = 4; - // start >= len - start = str.size(); - CHECK_THROWS_AS( - thread_out::thread_mod( - thread_count, vec, str, k, mod, congruence, start, minimized_h), - 
thread_out::BadThreadOutParams); - start = 0; - // num_kmers is negative - start = 7; - CHECK_THROWS_AS( - thread_out::thread_mod( - thread_count, vec, str, k, mod, congruence, start, minimized_h), - thread_out::BadThreadOutParams); - start = 0; - // num_kmers < thread_count - thread_count = 6; - CHECK_THROWS_AS( - thread_out::thread_mod( - thread_count, vec, str, k, mod, congruence, start, minimized_h), - thread_out::BadThreadOutParams); - thread_count = 4; - } + setupStrings(); + SECTION("Throw Errors") { + std::string str = "ACTGACTG"; + unsigned thread_count = 4; + std::vector> vec; + unsigned k = 4; + uint64_t mod = 1e9 + 7; + uint64_t congruence = 0; + size_t start = 0; + digest::MinimizedHashType minimized_h = + digest::MinimizedHashType::CANON; + // k < 4 + k = 3; + CHECK_THROWS_AS( + digest::thread_out::thread_mod( + thread_count, vec, str, k, mod, congruence, start, minimized_h), + digest::thread_out::BadThreadOutParams); + k = 4; + // start >= len + start = str.size(); + CHECK_THROWS_AS( + digest::thread_out::thread_mod( + thread_count, vec, str, k, mod, congruence, start, minimized_h), + digest::thread_out::BadThreadOutParams); + start = 0; + // num_kmers is negative + start = 7; + CHECK_THROWS_AS( + digest::thread_out::thread_mod( + thread_count, vec, str, k, mod, congruence, start, minimized_h), + digest::thread_out::BadThreadOutParams); + start = 0; + // num_kmers < thread_count + thread_count = 6; + CHECK_THROWS_AS( + digest::thread_out::thread_mod( + thread_count, vec, str, k, mod, congruence, start, minimized_h), + digest::thread_out::BadThreadOutParams); + thread_count = 4; + } - SECTION("Special Cases") { - unsigned thread_count = 4; - unsigned k = 4; - uint64_t mod = 3; - uint64_t congruence = 0; - size_t start = 0; - digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON; - // only 1 thread - thread_count = 1; - for (int i = 0; i < 10; i++) { - for (int i = 0; i < 4; i += 2) { - std::string str = test_strs[i].substr(start, 
99); - test_thread_mod(thread_count, str, k, mod, congruence, start, - minimized_h); - } - } + SECTION("Special Cases") { + unsigned thread_count = 4; + unsigned k = 4; + uint64_t mod = 3; + uint64_t congruence = 0; + size_t start = 0; + digest::MinimizedHashType minimized_h = + digest::MinimizedHashType::CANON; + // only 1 thread + thread_count = 1; + for (int i = 0; i < 10; i++) { + for (int i = 0; i < 4; i += 2) { + std::string str = test_strs[i].substr(start, 99); + test_thread_mod(thread_count, str, k, mod, congruence, start, + minimized_h); + } + } - // each thread gets 1 kmer - thread_count = 96; - for (int i = 0; i < 10; i++) { - for (int i = 0; i < 4; i += 2) { - std::string str = test_strs[i].substr(start, 99); - test_thread_mod(thread_count, str, k, mod, congruence, start, - minimized_h); - } - } + // each thread gets 1 kmer + thread_count = 96; + for (int i = 0; i < 10; i++) { + for (int i = 0; i < 4; i += 2) { + std::string str = test_strs[i].substr(start, 99); + test_thread_mod(thread_count, str, k, mod, congruence, start, + minimized_h); + } + } - // some threads get 2 kmers, the rest get 1 - thread_count = 50; - for (int i = 0; i < 10; i++) { - for (int i = 0; i < 4; i += 2) { - std::string str = test_strs[i].substr(start, 99); - test_thread_mod(thread_count, str, k, mod, congruence, start, - minimized_h); - } - } - } + // some threads get 2 kmers, the rest get 1 + thread_count = 50; + for (int i = 0; i < 10; i++) { + for (int i = 0; i < 4; i += 2) { + std::string str = test_strs[i].substr(start, 99); + test_thread_mod(thread_count, str, k, mod, congruence, start, + minimized_h); + } + } + } - SECTION("Full Testing") { - unsigned thread_count = 4; - unsigned k = 4; - uint64_t mod = 17; - uint64_t congruence = 0; - size_t start = 0; - digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON; - // the string changes - // thread_count changes - // start changes - for (int i = 0; i < 4; i++) { - for (int i = 0; i < 4; i += 2) { - for 
(int j = 4; j <= 64; j += 4) { - thread_count = j; - for (int l = 0; l <= 96; l += 13) { - start = l; - test_thread_mod(thread_count, test_strs[i], k, mod, congruence, - start, minimized_h); - } - } - } - } - } + SECTION("Full Testing") { + unsigned thread_count = 4; + unsigned k = 4; + uint64_t mod = 17; + uint64_t congruence = 0; + size_t start = 0; + digest::MinimizedHashType minimized_h = + digest::MinimizedHashType::CANON; + // the string changes + // thread_count changes + // start changes + for (int i = 0; i < 4; i++) { + for (int i = 0; i < 4; i += 2) { + for (int j = 4; j <= 64; j += 4) { + thread_count = j; + for (int l = 0; l <= 96; l += 13) { + start = l; + test_thread_mod(thread_count, test_strs[i], k, mod, + congruence, start, minimized_h); + } + } + } + } + } } TEST_CASE("thread_wind function testing") { - setupStrings(); - SECTION("Throw Errors") { - std::string str = "ACTGACTGACTG"; - unsigned thread_count = 4; - std::vector> vec; - unsigned k = 4; - const uint32_t large_wind_kmer_am = 4; - size_t start = 0; - digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON; + setupStrings(); + SECTION("Throw Errors") { + std::string str = "ACTGACTGACTG"; + unsigned thread_count = 4; + std::vector> vec; + unsigned k = 4; + const uint32_t large_wind_kmer_am = 4; + size_t start = 0; + digest::MinimizedHashType minimized_h = + digest::MinimizedHashType::CANON; - // num_lwinds is negative - start = 9; - CHECK_THROWS_AS( - (thread_out::thread_wind( - thread_count, vec, str, k, large_wind_kmer_am, start, minimized_h)), - thread_out::BadThreadOutParams); - start = 0; - // num_lwinds < thread_count - thread_count = 8; + // num_lwinds is negative + start = 9; + CHECK_THROWS_AS( + (digest::thread_out::thread_wind( + thread_count, vec, str, k, large_wind_kmer_am, start, + minimized_h)), + digest::thread_out::BadThreadOutParams); + start = 0; + // num_lwinds < thread_count + thread_count = 8; - CHECK_THROWS_AS( - (thread_out::thread_wind( - 
thread_count, vec, str, k, large_wind_kmer_am, start, minimized_h)), - thread_out::BadThreadOutParams); - thread_count = 4; - } + CHECK_THROWS_AS( + (digest::thread_out::thread_wind( + thread_count, vec, str, k, large_wind_kmer_am, start, + minimized_h)), + digest::thread_out::BadThreadOutParams); + thread_count = 4; + } - SECTION("Special Cases") { - unsigned thread_count = 4; - unsigned k = 4; - const uint32_t large_wind_kmer_am = 8; - size_t start = 0; - digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON; + SECTION("Special Cases") { + unsigned thread_count = 4; + unsigned k = 4; + const uint32_t large_wind_kmer_am = 8; + size_t start = 0; + digest::MinimizedHashType minimized_h = + digest::MinimizedHashType::CANON; - // only 1 thread - thread_count = 1; - for (int i = 0; i < 10; i++) { - for (int i = 0; i < 4; i += 2) { - std::string str = test_strs[i].substr(start, 99); - test_thread_wind(thread_count, str, k, large_wind_kmer_am, start, - minimized_h); - } - } + // only 1 thread + thread_count = 1; + for (int i = 0; i < 10; i++) { + for (int i = 0; i < 4; i += 2) { + std::string str = test_strs[i].substr(start, 99); + test_thread_wind(thread_count, str, k, large_wind_kmer_am, + start, minimized_h); + } + } - // each thread gets 1 lwind - thread_count = 86; - for (int i = 0; i < 10; i++) { - for (int i = 0; i < 4; i += 2) { - std::string str = test_strs[i].substr(start, 99); - test_thread_wind(thread_count, str, k, large_wind_kmer_am, start, - minimized_h); - } - } + // each thread gets 1 lwind + thread_count = 86; + for (int i = 0; i < 10; i++) { + for (int i = 0; i < 4; i += 2) { + std::string str = test_strs[i].substr(start, 99); + test_thread_wind(thread_count, str, k, large_wind_kmer_am, + start, minimized_h); + } + } - // some threads get 2 kmers, the rest get 1 - thread_count = 50; - for (int i = 0; i < 10; i++) { - for (int i = 0; i < 4; i += 2) { - std::string str = test_strs[i].substr(start, 99); - test_thread_wind(thread_count, 
str, k, large_wind_kmer_am, start, - minimized_h); - } - } - } + // some threads get 2 kmers, the rest get 1 + thread_count = 50; + for (int i = 0; i < 10; i++) { + for (int i = 0; i < 4; i += 2) { + std::string str = test_strs[i].substr(start, 99); + test_thread_wind(thread_count, str, k, large_wind_kmer_am, + start, minimized_h); + } + } + } - SECTION("Full Testing") { - unsigned thread_count = 4; - unsigned k = 4; - const uint32_t large_wind_kmer_am = 8; - size_t start = 0; - digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON; - // the string changes - // thread_count changes - // start changes - for (int i = 0; i < 4; i++) { - for (int i = 0; i < 4; i += 2) { - for (int j = 4; j <= 64; j += 4) { - thread_count = j; - for (int l = 0; l <= 96; l += 13) { - start = l; - test_thread_wind(thread_count, test_strs[i], k, large_wind_kmer_am, - start, minimized_h); - } - } - } - } - } + SECTION("Full Testing") { + unsigned thread_count = 4; + unsigned k = 4; + const uint32_t large_wind_kmer_am = 8; + size_t start = 0; + digest::MinimizedHashType minimized_h = + digest::MinimizedHashType::CANON; + // the string changes + // thread_count changes + // start changes + for (int i = 0; i < 4; i++) { + for (int i = 0; i < 4; i += 2) { + for (int j = 4; j <= 64; j += 4) { + thread_count = j; + for (int l = 0; l <= 96; l += 13) { + start = l; + test_thread_wind(thread_count, test_strs[i], k, + large_wind_kmer_am, start, + minimized_h); + } + } + } + } + } } TEST_CASE("thread_sync function testing") { - setupStrings(); - SECTION("Special Cases") { - unsigned thread_count = 4; - unsigned k = 4; - const uint32_t large_wind_kmer_am = 8; - size_t start = 0; - digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON; + setupStrings(); + SECTION("Special Cases") { + unsigned thread_count = 4; + unsigned k = 4; + const uint32_t large_wind_kmer_am = 8; + size_t start = 0; + digest::MinimizedHashType minimized_h = + digest::MinimizedHashType::CANON; - 
// only 1 thread - thread_count = 1; - for (int i = 0; i < 10; i++) { - for (int i = 0; i < 4; i += 2) { - std::string str = test_strs[i].substr(start, 99); - test_thread_sync(thread_count, str, k, large_wind_kmer_am, start, - minimized_h); - } - } + // only 1 thread + thread_count = 1; + for (int i = 0; i < 10; i++) { + for (int i = 0; i < 4; i += 2) { + std::string str = test_strs[i].substr(start, 99); + test_thread_sync(thread_count, str, k, large_wind_kmer_am, + start, minimized_h); + } + } - // each thread gets 1 lwind - thread_count = 86; - for (int i = 0; i < 10; i++) { - for (int i = 0; i < 4; i += 2) { - std::string str = test_strs[i].substr(start, 99); - test_thread_sync(thread_count, str, k, large_wind_kmer_am, start, - minimized_h); - } - } + // each thread gets 1 lwind + thread_count = 86; + for (int i = 0; i < 10; i++) { + for (int i = 0; i < 4; i += 2) { + std::string str = test_strs[i].substr(start, 99); + test_thread_sync(thread_count, str, k, large_wind_kmer_am, + start, minimized_h); + } + } - // some threads get 2 kmers, the rest get 1 - thread_count = 50; - for (int i = 0; i < 10; i++) { - for (int i = 0; i < 4; i += 2) { - std::string str = test_strs[i].substr(start, 99); - test_thread_sync(thread_count, str, k, large_wind_kmer_am, start, - minimized_h); - } - } - } + // some threads get 2 kmers, the rest get 1 + thread_count = 50; + for (int i = 0; i < 10; i++) { + for (int i = 0; i < 4; i += 2) { + std::string str = test_strs[i].substr(start, 99); + test_thread_sync(thread_count, str, k, large_wind_kmer_am, + start, minimized_h); + } + } + } - SECTION("Full Testing") { - unsigned thread_count = 4; - unsigned k = 4; - const int32_t large_wind_kmer_am = 8; - size_t start = 0; - digest::MinimizedHashType minimized_h = digest::MinimizedHashType::CANON; - // the string changes - // thread_count changes - // start changes - for (int i = 0; i < 4; i++) { - for (int i = 0; i < 4; i += 2) { - for (int j = 4; j <= 64; j += 4) { - thread_count = j; - 
for (int l = 0; l <= 96; l += 13) { - start = l; - test_thread_sync(thread_count, test_strs[i], k, large_wind_kmer_am, - start, minimized_h); - } - } - } - } - } + SECTION("Full Testing") { + unsigned thread_count = 4; + unsigned k = 4; + const int32_t large_wind_kmer_am = 8; + size_t start = 0; + digest::MinimizedHashType minimized_h = + digest::MinimizedHashType::CANON; + // the string changes + // thread_count changes + // start changes + for (int i = 0; i < 4; i++) { + for (int i = 0; i < 4; i += 2) { + for (int j = 4; j <= 64; j += 4) { + thread_count = j; + for (int l = 0; l <= 96; l += 13) { + start = l; + test_thread_sync(thread_count, test_strs[i], k, + large_wind_kmer_am, start, + minimized_h); + } + } + } + } + } }