diff --git a/README.md b/README.md index 789cd17..3967463 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,82 @@ -# Digest -C++ library which supports various minimizer schemes for digestion of DNA sequences +# ✂️ Digest: fast, multi-use $k$-mer sub-sampling library +

+ image1 +
+ Visualization of different minimizer schemes supported in Digest and code example using library +

+ + +## What is the Digest library? +- a `C++` library that supports various sub-sampling schemes for $k$-mers in DNA sequences. + - `Digest` library utilizes the rolling hash-function from [ntHash](https://github.com/bcgsc/ntHash) to order the $k$-mers in a window. + +## How to install and build into your project? +image2 + +### Step 1: Install library + +After cloning from GitHub, we use the [Meson](https://mesonbuild.com) build-system to install the library. +- `PREFIX` is an absolute path to library files will be install (`*.h` and `*.a` files) + - **IMPORTANT**: `PREFIX` should not be the root directory of the `Digest/` repo to avoid any issues with installation. +- These commands generate an `include` and `lib` folders in `PREFIX` folder + +```bash +git clone https://github.com/VeryAmazed/digest.git + +meson setup --prefix= --buildtype=release build +meson install -C build +``` + +### Step 2: Include Digest in your project + +#### (a) Using `Meson`: + +If your coding project uses `Meson` to build the executable(s), you can include a file called `subprojects/digest.wrap` in your repository and let Meson install it for you. + +#### (b) Using `g++`: + +To use Digest in your C++ project, you just need to include the header files (`*.h`) and library file (`*.a`) that were installed in the first step. Assuming that `install/` is the directory you installed them in, here is how you can compile. + +```bash +g++ -std=c++17 -o main main.cpp -I install/include/ -L install/lib -lnthash +``` + +## Detailed Look at Example Usage (2 ways): + +There are three types of minimizer schemes that can be used: + +1. Windowed Minimizer +2. Modimizer +3. Syncmer + +The general steps to use Digest is as follows: (1) include the relevant header files, (2) declare the Digest object and (3) find the positions where the minimizers are present in the sequence. + +### 1. Find positions of minimizers: +```cpp +#include "digest/digester.hpp" +#include "digest/window_minimizer.hpp" + +digest::WindowMin digester (dna, 15, 7); + +std::vector output; +digester.roll_minimizer(100, output); +``` +- This code snippet will find up to 100 Windowed Minimizers and store their positions in the vector called `output`. +- `digest::BadCharPolicy::WRITEOVER` means that anytime the code encounters an non-`ACTG` character, it will replace it with an `A`. + - `digest::BadCharPolicy::SKIPOVER` will skip any $k$-mers with non-`ACTG` characters +- `digest::ds::Adaptive` is our recommended data-structure for finding the minimum value in a window (see wiki for other options) + +### 2. Find both positions and hash values of minimizers +If you would like to obtain both the positions and hash values for each minimizer, you can pass a vector of paired integers to do so. + +``` +std::vector> output; +digester.roll_minimizer(100, output); +``` + + + diff --git a/include/digest/data_structure.hpp b/include/digest/data_structure.hpp index 34f097b..9070ba8 100644 --- a/include/digest/data_structure.hpp +++ b/include/digest/data_structure.hpp @@ -115,7 +115,7 @@ template struct Naive { std::array arr; unsigned int i = 0; - Naive(uint32_t){}; + Naive(uint32_t) {}; Naive(const Naive &other) = default; Naive &operator=(const Naive &other) = default; @@ -183,7 +183,7 @@ template struct Naive2 { unsigned int last = 0; std::vector arr = std::vector(k); - Naive2(uint32_t){}; + Naive2(uint32_t) {}; Naive2(const Naive2 &other) = default; Naive2 &operator=(const Naive2 &other) = default; diff --git a/tests/density/ACTG.cpp b/tests/density/ACTG.cpp index 626117d..b06c8fc 100644 --- a/tests/density/ACTG.cpp +++ b/tests/density/ACTG.cpp @@ -31,83 +31,87 @@ int dir2[] = {0, 1, 0, -1, 1, -1, 1, -1}; int main() { - std::cout << std::fixed << std::setprecision(8); - // if you use ld, use the above and don't use string stream - - std::string str; - - std::vector strs; - assert(freopen("../tests/density/ACTG.txt", "r", stdin)); - for (int i = 0; i < 100; i++) { - std::cin >> str; - strs.pb(str); - } - - std::vector> mod_min_vec(4, std::vector()); - std::vector> wind_min_vec(4, std::vector()); - std::vector> sync_vec(4, std::vector()); - - uint64_t mods[4] = {109, 128, 1009, 1024}; - unsigned l_winds[4] = {7, 8, 17, 16}; - - double kmers = 100000 - 16 + 1; - - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 100; j++) { - digest::ModMin mm( - strs[j], 16, mods[i], 0, 0, digest::MinimizedHashType::CANON); - std::vector temp; - mm.roll_minimizer(100000, temp); - double am = temp.size(); - am /= kmers; - mod_min_vec[i].pb(am); - } - } - - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 100; j++) { - digest::WindowMin - wm(strs[j], 16, l_winds[i], 0, digest::MinimizedHashType::CANON); - std::vector temp; - wm.roll_minimizer(100000, temp); - double am = temp.size(); - am /= kmers; - wind_min_vec[i].pb(am); - } - } - - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 100; j++) { - digest::Syncmer - syn(strs[j], 16, l_winds[i], 0, digest::MinimizedHashType::CANON); - std::vector temp; - syn.roll_minimizer(100000, temp); - double am = temp.size(); - am /= kmers; - sync_vec[i].pb(am); - } - } - assert(freopen("../tests/density/out1.txt", "w", stdout)); - for (int i = 0; i < 4; i++) { - for (size_t j = 0; j < 100; j++) { - std::cout << mod_min_vec[i][j] << " "; - } - std::cout << std::endl; - } - - for (int i = 0; i < 4; i++) { - for (size_t j = 0; j < 100; j++) { - std::cout << wind_min_vec[i][j] << " "; - } - std::cout << std::endl; - } - - for (int i = 0; i < 4; i++) { - for (size_t j = 0; j < 100; j++) { - std::cout << sync_vec[i][j] << " "; - } - std::cout << std::endl; - } - - return 0; + std::cout << std::fixed << std::setprecision(8); + // if you use ld, use the above and don't use string stream + + std::string str; + + std::vector strs; + assert(freopen("../tests/density/ACTG.txt", "r", stdin)); + for (int i = 0; i < 100; i++) { + std::cin >> str; + strs.pb(str); + } + + std::vector> mod_min_vec(4, std::vector()); + std::vector> wind_min_vec(4, std::vector()); + std::vector> sync_vec(4, std::vector()); + + uint64_t mods[4] = {109, 128, 1009, 1024}; + unsigned l_winds[4] = {7, 8, 17, 16}; + + double kmers = 100000 - 16 + 1; + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 100; j++) { + digest::ModMin mm( + strs[j], 16, mods[i], 0, 0, digest::MinimizedHashType::CANON); + std::vector temp; + mm.roll_minimizer(100000, temp); + double am = temp.size(); + am /= kmers; + mod_min_vec[i].pb(am); + } + } + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 100; j++) { + digest::WindowMin + wm(strs[j], 16, l_winds[i], 0, + digest::MinimizedHashType::CANON); + std::vector temp; + wm.roll_minimizer(100000, temp); + double am = temp.size(); + am /= kmers; + wind_min_vec[i].pb(am); + } + } + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 100; j++) { + digest::Syncmer + syn(strs[j], 16, l_winds[i], 0, + digest::MinimizedHashType::CANON); + std::vector temp; + syn.roll_minimizer(100000, temp); + double am = temp.size(); + am /= kmers; + sync_vec[i].pb(am); + } + } + assert(freopen("../tests/density/out1.txt", "w", stdout)); + for (int i = 0; i < 4; i++) { + for (size_t j = 0; j < 100; j++) { + std::cout << mod_min_vec[i][j] << " "; + } + std::cout << std::endl; + } + + for (int i = 0; i < 4; i++) { + for (size_t j = 0; j < 100; j++) { + std::cout << wind_min_vec[i][j] << " "; + } + std::cout << std::endl; + } + + for (int i = 0; i < 4; i++) { + for (size_t j = 0; j < 100; j++) { + std::cout << sync_vec[i][j] << " "; + } + std::cout << std::endl; + } + + return 0; } \ No newline at end of file diff --git a/tests/density/non-ACTG.cpp b/tests/density/non-ACTG.cpp index df255f7..7bf42cb 100644 --- a/tests/density/non-ACTG.cpp +++ b/tests/density/non-ACTG.cpp @@ -54,105 +54,109 @@ int dir2[] = {0, 1, 0, -1, 1, -1, 1, -1}; int main() { - std::cout << std::fixed << std::setprecision(8); - // if you use ld, use the above and don't use string stream - - std::string str; - - std::vector strs; - assert(freopen("../tests/density/non-ACTG.txt", "r", stdin)); - for (int i = 0; i < 100; i++) { - std::cin >> str; - strs.pb(str); - } - - std::vector> mod_min_vec(4, std::vector()); - std::vector> wind_min_vec(4, std::vector()); - std::vector> sync_vec(4, std::vector()); - - uint64_t mods[4] = {109, 128, 1009, 1024}; - unsigned l_winds[4] = {7, 8, 17, 16}; - - std::vector kmers(100, 0); - - for (int i = 0; i < 100; i++) { - int start = 0; - while (start + 7 < 1e5) { - bool works = true; - for (int j = 0; j < 16; j++) { - if (strs[i][start + j] == 'N') { - works = false; - start = start + j; - break; - } - } - if (works) { - kmers[i]++; - } - start++; - } - // std::cout << kmers[i] << " "; - } - // std::cout << std::endl; - - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 100; j++) { - digest::ModMin mm( - strs[j], 16, mods[i], 0, 0, digest::MinimizedHashType::CANON); - std::vector temp; - mm.roll_minimizer(100000, temp); - double am = temp.size(); - am /= kmers[i]; - mod_min_vec[i].pb(am); - } - } - - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 100; j++) { - digest::WindowMin - wm(strs[j], 16, l_winds[i], 0, digest::MinimizedHashType::CANON); - std::vector temp; - wm.roll_minimizer(100000, temp); - double am = temp.size(); - am /= kmers[i]; - - wind_min_vec[i].pb(am); - } - } - - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 100; j++) { - digest::Syncmer - syn(strs[j], 16, l_winds[i], 0, digest::MinimizedHashType::CANON); - std::vector temp; - syn.roll_minimizer(100000, temp); - double am = temp.size(); - am /= kmers[i]; - - sync_vec[i].pb(am); - } - } - assert(freopen("../tests/density/out2.txt", "w", stdout)); - for (int i = 0; i < 4; i++) { - for (size_t j = 0; j < 100; j++) { - std::cout << mod_min_vec[i][j] << " "; - } - std::cout << std::endl; - } - - for (int i = 0; i < 4; i++) { - for (size_t j = 0; j < 100; j++) { - std::cout << wind_min_vec[i][j] << " "; - } - std::cout << std::endl; - } - - for (int i = 0; i < 4; i++) { - for (size_t j = 0; j < 100; j++) { - std::cout << sync_vec[i][j] << " "; - } - std::cout << std::endl; - } - - return 0; + std::cout << std::fixed << std::setprecision(8); + // if you use ld, use the above and don't use string stream + + std::string str; + + std::vector strs; + assert(freopen("../tests/density/non-ACTG.txt", "r", stdin)); + for (int i = 0; i < 100; i++) { + std::cin >> str; + strs.pb(str); + } + + std::vector> mod_min_vec(4, std::vector()); + std::vector> wind_min_vec(4, std::vector()); + std::vector> sync_vec(4, std::vector()); + + uint64_t mods[4] = {109, 128, 1009, 1024}; + unsigned l_winds[4] = {7, 8, 17, 16}; + + std::vector kmers(100, 0); + + for (int i = 0; i < 100; i++) { + int start = 0; + while (start + 7 < 1e5) { + bool works = true; + for (int j = 0; j < 16; j++) { + if (strs[i][start + j] == 'N') { + works = false; + start = start + j; + break; + } + } + if (works) { + kmers[i]++; + } + start++; + } + // std::cout << kmers[i] << " "; + } + // std::cout << std::endl; + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 100; j++) { + digest::ModMin mm( + strs[j], 16, mods[i], 0, 0, digest::MinimizedHashType::CANON); + std::vector temp; + mm.roll_minimizer(100000, temp); + double am = temp.size(); + am /= kmers[i]; + mod_min_vec[i].pb(am); + } + } + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 100; j++) { + digest::WindowMin + wm(strs[j], 16, l_winds[i], 0, + digest::MinimizedHashType::CANON); + std::vector temp; + wm.roll_minimizer(100000, temp); + double am = temp.size(); + am /= kmers[i]; + + wind_min_vec[i].pb(am); + } + } + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 100; j++) { + digest::Syncmer + syn(strs[j], 16, l_winds[i], 0, + digest::MinimizedHashType::CANON); + std::vector temp; + syn.roll_minimizer(100000, temp); + double am = temp.size(); + am /= kmers[i]; + + sync_vec[i].pb(am); + } + } + assert(freopen("../tests/density/out2.txt", "w", stdout)); + for (int i = 0; i < 4; i++) { + for (size_t j = 0; j < 100; j++) { + std::cout << mod_min_vec[i][j] << " "; + } + std::cout << std::endl; + } + + for (int i = 0; i < 4; i++) { + for (size_t j = 0; j < 100; j++) { + std::cout << wind_min_vec[i][j] << " "; + } + std::cout << std::endl; + } + + for (int i = 0; i < 4; i++) { + for (size_t j = 0; j < 100; j++) { + std::cout << sync_vec[i][j] << " "; + } + std::cout << std::endl; + } + + return 0; } \ No newline at end of file diff --git a/tests/test/test.cpp b/tests/test/test.cpp index cc1c72b..00ca8ed 100644 --- a/tests/test/test.cpp +++ b/tests/test/test.cpp @@ -1246,7 +1246,7 @@ TEST_CASE("ModMin Testing") { { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Adaptive, 32) } \ { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Adaptive, 33) } \ { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Adaptive, 63) } \ - { F(digest::BadCharPolicy::SKIPOVER, digest::ds::Adaptive, 64) } + {F(digest::BadCharPolicy::SKIPOVER, digest::ds::Adaptive, 64)} TEST_CASE("WindowMin Testing") { SECTION("Constructor Testing") {