diff --git a/.github/workflows/make_release.yml b/.github/workflows/make_release.yml new file mode 100644 index 0000000..50f374b --- /dev/null +++ b/.github/workflows/make_release.yml @@ -0,0 +1,128 @@ +name: Build binaries +on: + push: + tags: + - "v*.*.*" + branches: + - msweep-release-testing + +jobs: + build_linux-x86_64: + runs-on: ubuntu-latest + container: phusion/holy-build-box-64:3.0.2 + steps: + - name: Install wget + id: install-wget + run: yum install -y wget + + - name: Create io directory + id: mkdir-io + run: mkdir /io && cd /io + + - name: Download build script + id: dl-build-script + run: wget https://raw.githubusercontent.com/tmaklin/biobins/master/linux/mSWEEP/build.sh + + - name: Compile binary in Holy Build Box container + id: compile-in-container + run: chmod +x build.sh && ./build.sh ${{ github.ref_name }} + + - name: Upload linux-x86_64 binary + if: success() + uses: actions/upload-artifact@v3 + with: + name: mSWEEP-${{ github.ref_name }}-x86_64-redhat-linux + path: /io/mSWEEP-${{ github.ref_name }}-x86_64-redhat-linux.tar.gz + + build_macOS-x86_64: + runs-on: ubuntu-latest + container: ghcr.io/shepherdjerred/macos-cross-compiler:latest + steps: + - name: Install wget + id: install-wget + run: apt install -y wget + + - name: Create io directory + id: mkdir-io + run: mkdir /io && cd /io + + - name: Download toolchain file + id: dl-toolchain-file + run: wget https://raw.githubusercontent.com/tmaklin/biobins/master/macOS/x86-64-toolchain.cmake && cp x86-64-toolchain.cmake /io/x86-64-toolchain.cmake && cp x86-64-toolchain.cmake /x86-64-toolchain.cmake + + - name: Download build script + id: dl-build-script + run: wget https://raw.githubusercontent.com/tmaklin/biobins/master/macOS/mSWEEP/build.sh + + - name: Compile binary in macOS Cross Compiler container + id: compile-in-container + run: chmod +x build.sh && ./build.sh ${{ github.ref_name }} x86-64 + + - name: Upload macOS-x86_64 binary + if: success() + uses: actions/upload-artifact@v3 + with: + name: mSWEEP-${{ github.ref_name }}-x86_64-apple-darwin22 + path: /io/mSWEEP-${{ github.ref_name }}-x86_64-apple-darwin22.tar.gz + + build_macOS-arm64: + runs-on: ubuntu-latest + container: ghcr.io/shepherdjerred/macos-cross-compiler:latest + steps: + - name: Install wget + id: install-wget + run: apt install -y wget + + - name: Create io directory + id: mkdir-io + run: mkdir /io && cd /io + + - name: Download toolchain file + id: dl-toolchain-file + run: wget https://raw.githubusercontent.com/tmaklin/biobins/master/macOS/arm64-toolchain.cmake && cp arm64-toolchain.cmake /io/arm64-toolchain.cmake && cp arm64-toolchain.cmake /arm64-toolchain.cmake + + - name: Download build script + id: dl-build-script + run: wget https://raw.githubusercontent.com/tmaklin/biobins/master/macOS/mSWEEP/build.sh + + - name: Compile binary in macOS Cross Compiler container + id: compile-in-container + run: chmod +x build.sh && ./build.sh ${{ github.ref_name }} arm64 + + - name: Upload macOS-arm64 binary + if: success() + uses: actions/upload-artifact@v3 + with: + name: mSWEEP-${{ github.ref_name }}-arm64-apple-darwin22 + path: /io/mSWEEP-${{ github.ref_name }}-arm64-apple-darwin22.tar.gz + + create-release: + runs-on: ubuntu-latest + + needs: [ build_linux-x86_64, build_macOS-x86_64, build_macOS-arm64 ] + + steps: + - uses: actions/checkout@v2 + + - uses: actions/download-artifact@v2 + with: + path: build + + - name: Organise files + shell: bash + run: | + cp build/mSWEEP-${{ github.ref_name }}-arm64-apple-darwin22/mSWEEP-${{ github.ref_name }}-arm64-apple-darwin22.tar.gz . + cp build/mSWEEP-${{ github.ref_name }}-x86_64-apple-darwin22/mSWEEP-${{ github.ref_name }}-x86_64-apple-darwin22.tar.gz . + cp build/mSWEEP-${{ github.ref_name }}-x86_64-redhat-linux/mSWEEP-${{ github.ref_name }}-x86_64-redhat-linux.tar.gz . + + - name: Create release + id: create_release + uses: softprops/action-gh-release@v1 + with: + name: Release ${{ github.ref_name }} + draft: false + prerelease: false + fail_on_unmatched_files: true + generate_release_notes: true + files: | + mSWEEP-*.tar.gz \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 75e8ab1..f8564f4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,6 @@ -cmake_minimum_required(VERSION 2.8.12) +cmake_minimum_required(VERSION 3.11) project(mSWEEP) +include(FetchContent) ## Determine build type if(NOT CMAKE_BUILD_TYPE) @@ -22,6 +23,8 @@ if(CMAKE_BUILD_TYPE MATCHES Release) endif() endif() +set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH} ${CURRENT_BINARY_DIR}/cmake") + if(CMAKE_BUILD_WITH_FLTO) cmake_policy(SET CMP0069 NEW) set(CMAKE_POLICY_DEFAULT_CMP0069 NEW) @@ -76,49 +79,91 @@ endif() string(TIMESTAMP _BUILD_TIMESTAMP) ## Generate a version.h file containing build version and timestamp -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/mSWEEP_version.h.in ${CMAKE_CURRENT_BINARY_DIR}/include/mSWEEP_version.h @ONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/mSWEEP_version.h.in ${CMAKE_CURRENT_BINARY_DIR}/include/mSWEEP_version.h @ONLY) ## Configure OpenMP if it supported on the system. -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/mSWEEP_openmp_config.hpp.in ${CMAKE_CURRENT_BINARY_DIR}/include/mSWEEP_openmp_config.hpp @ONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/mSWEEP_openmp_config.hpp.in ${CMAKE_CURRENT_BINARY_DIR}/include/mSWEEP_openmp_config.hpp @ONLY) ## Configure MPI if it's supported on the system. -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/mSWEEP_mpi_config.hpp.in ${CMAKE_CURRENT_BINARY_DIR}/include/mSWEEP_mpi_config.hpp @ONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/mSWEEP_mpi_config.hpp.in ${CMAKE_CURRENT_BINARY_DIR}/include/mSWEEP_mpi_config.hpp @ONLY) add_executable(mSWEEP ${CMAKE_CURRENT_SOURCE_DIR}/src/mSWEEP.cpp) ## Check supported compression types -find_package(BZip2) -if (BZIP2_FOUND) - include_directories(${BZIP2_INCLUDE_DIRS}) +## Dependencies +### Check supported compression types +#### zlib +if ((DEFINED ZLIB_LIBRARY AND DEFINED ZLIB_INCLUDE_DIR) AND (NOT DEFINED ZLIB_FOUND)) + message(STATUS "zlib library provided in: " ${ZLIB_LIBRARY}) + message(STATUS "zlib headers provided in: " ${ZLIB_INCLUDE_DIR}) + include_directories(${ZLIB_INCLUDE_DIR}) + target_link_libraries(mSWEEP ${ZLIB_LIBRARY}) + set(MSWEEP_HAVE_ZLIB 1) +else() + find_package(ZLIB) + if (ZLIB_FOUND) + include_directories(${ZLIB_INCLUDE_DIR}) + target_link_libraries(mSWEEP ${ZLIB_LIBRARY}) + set(MSWEEP_HAVE_ZLIB 1) + else() + set(MSWEEP_HAVE_ZLIB 0) + endif() +endif() + +#### bzip2 +if (DEFINED BZIP2_LIBRARIES AND DEFINED BZIP2_INCLUDE_DIR AND (NOT DEFINED BZIP2_FOUND)) + message(STATUS "bzip2 library provided in: " ${BZIP2_LIBRARIES}) + message(STATUS "bzip2 headers provided in: " ${BZIP2_INCLUDE_DIR}) + include_directories(${BZIP2_INCLUDE_DIR}) target_link_libraries(mSWEEP ${BZIP2_LIBRARIES}) -endif() -find_package(LibLZMA) -if (LIBLZMA_FOUND) - include_directories(${LIBLZMA_INCLUDE_DIRS}) - target_link_libraries(mSWEEP ${LIBLZMA_LIBRARIES}) + set(MSWEEP_HAVE_BZIP2 1) +else() + find_package(BZip2) + if (BZIP2_FOUND) + include_directories(${BZIP2_INCLUDE_DIR}) + target_link_libraries(mSWEEP ${BZIP2_LIBRARIES}) + set(MSWEEP_HAVE_BZIP2 1) + else() + set(MSWEEP_HAVE_BZIP2 0) + endif() endif() -find_package(ZLIB) -if (ZLIB_FOUND) - include_directories(${ZLIB_INCLUDE_DIRS}) - target_link_libraries(mSWEEP ${ZLIB_LIBRARIES}) + +#### lzma +if (DEFINED LIBLZMA_LIBRARY AND DEFINED LIBLZMA_INCLUDE_DIR AND (NOT DEFINED LIBLZMA_FOUND)) + message(STATUS "liblzma library provided in: " ${LIBLZMA_LIBRARY}) + message(STATUS "liblzma headers provided in: " ${LIBLZMA_INCLUDE_DIR}) + include_directories(${LIBLZMA_INCLUDE_DIR}) + target_link_libraries(mSWEEP ${LIBLZMA_LIBRARY}) + set(MSWEEP_HAVE_LIBLZMA 1) +else() + find_package(LibLZMA) + if (LIBLZMA_FOUND) + include_directories(${LIBLZMA_INCLUDE_DIR}) + target_link_libraries(mSWEEP ${LIBLZMA_LIBRARY}) + set(MSWEEP_HAVE_LIBLZMA 1) + else() + set(MSWEEP_HAVE_LIBLZMA 0) + endif() endif() ## bxzstr if (DEFINED CMAKE_BXZSTR_HEADERS) message(STATUS "bxzstr headers provided in: ${CMAKE_BXZSTR_HEADERS}") else() - configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/CMakeLists-bxzstr.txt.in ${CMAKE_CURRENT_BINARY_DIR}/external/bxzstr-download/CMakeLists.txt) - execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/bxzstr-download ) - if(result) - message(FATAL_ERROR "CMake step for bxzstr failed: ${result}") - endif() - execute_process(COMMAND ${CMAKE_COMMAND} --build . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/bxzstr-download ) - if(result) - message(FATAL_ERROR "Build step for bxzstr failed: ${result}") - endif() - set(CMAKE_BXZSTR_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/external/bxzstr/include) + FetchContent_Declare(bxzstr + GIT_REPOSITORY https://github.com/tmaklin/bxzstr.git + GIT_TAG v1.1.0 + PREFIX "external" + SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external/bxzstr" + BUILD_IN_SOURCE 0 + CMAKE_ARGS -D ZSTD_FOUND=0 + -D ZLIB_FOUND=${MSWEEP_HAVE_ZLIB} + -D BZIP2_FOUND=${MSWEEP_HAVE_BZIP2} + -D LIBLZMA_FOUND=${MSWEEP_HAVE_LIBLZMA} + BUILD_COMMAND "" + CONFIGURE_COMMAND "" + INSTALL_COMMAND "" + ) + FetchContent_MakeAvailable(bxzstr) + set(CMAKE_BXZSTR_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/bxzstr/include) endif() include_directories(${CMAKE_BXZSTR_HEADERS}) @@ -127,20 +172,18 @@ include_directories(${CMAKE_BXZSTR_HEADERS}) if (DEFINED CMAKE_CXXIO_HEADERS) message(STATUS "cxxio headers provided in: ${CMAKE_CXXIO_HEADERS}") else() - configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/CMakeLists-cxxio.txt.in ${CMAKE_CURRENT_BINARY_DIR}/external/cxxio-download/CMakeLists.txt) - execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/cxxio-download ) - if(result) - message(FATAL_ERROR "CMake step for cxxio failed: ${result}") - endif() - execute_process(COMMAND ${CMAKE_COMMAND} --build . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/cxxio-download ) - if(result) - message(FATAL_ERROR "Build step for cxxio failed: ${result}") - endif() - set(CMAKE_CXXIO_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/external/cxxio/include) + FetchContent_Declare(cxxio + GIT_REPOSITORY https://github.com/tmaklin/cxxio.git + GIT_TAG v0.1.0 + PREFIX "external" + SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external/cxxio" + BUILD_IN_SOURCE 0 + BUILD_COMMAND "" + CONFIGURE_COMMAND "" + INSTALL_COMMAND "" + ) + FetchContent_MakeAvailable(cxxio) + set(CMAKE_CXXIO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/cxxio/include) endif() include_directories("${CMAKE_CXXIO_HEADERS}") @@ -148,158 +191,181 @@ include_directories("${CMAKE_CXXIO_HEADERS}") if (DEFINED CMAKE_CXXARGS_HEADERS) message(STATUS "cxxargs headers provided in: ${CMAKE_CXXARGS_HEADERS}") else() - configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/CMakeLists-cxxargs.txt.in ${CMAKE_CURRENT_BINARY_DIR}/external/cxxargs-download/CMakeLists.txt) - execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/cxxargs-download ) - if(result) - message(FATAL_ERROR "CMake step for cxxargs failed: ${result}") - endif() - execute_process(COMMAND ${CMAKE_COMMAND} --build . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/cxxargs-download ) - if(result) - message(FATAL_ERROR "Build step for cxxargs failed: ${result}") - endif() - set(CMAKE_CXXARGS_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/external/cxxargs/include) + FetchContent_Declare(cxxargs + GIT_REPOSITORY https://github.com/tmaklin/cxxargs.git + GIT_TAG v1.1.4 + PREFIX "external" + SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external/cxxargs" + BUILD_IN_SOURCE 0 + BUILD_COMMAND "" + CONFIGURE_COMMAND "" + INSTALL_COMMAND "" + ) + FetchContent_MakeAvailable(cxxargs) + set(CMAKE_CXXARGS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/cxxargs/include) endif() include_directories("${CMAKE_CXXARGS_HEADERS}") ## alignment-writer -if (DEFINED CMAKE_ALIGNMENT_WRITER_HEADERS) +if (DEFINED CMAKE_ALIGNMENT_WRITER_HEADERS AND DEFINED CMAKE_ALIGNMENT_WRITER_LIBRARY) message(STATUS "alignment-writer headers provided in: ${CMAKE_ALIGNMENT_WRITER_HEADERS}") + message(STATUS "alignment-writer library provided in: ${CMAKE_ALIGNMENT_WRITER_LIBRARY}") else() - configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/CMakeLists-alignment-writer.txt.in ${CMAKE_CURRENT_BINARY_DIR}/external/alignment-writer-download/CMakeLists.txt) - execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/alignment-writer-download ) - if(result) - message(FATAL_ERROR "CMake step for alignment-writer failed: ${result}") - endif() - execute_process(COMMAND ${CMAKE_COMMAND} --build . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/alignment-writer-download ) - if(result) - message(FATAL_ERROR "Build step for alignment-writer failed: ${result}") - endif() - add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/external/alignment-writer - ${CMAKE_CURRENT_BINARY_DIR}/external/alignment-writer/build) + FetchContent_Declare(alignment-writer + GIT_REPOSITORY https://github.com/tmaklin/alignment-writer.git + GIT_TAG v0.5.0 + PREFIX "external" + SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external/alignment-writer" + BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/alignment-writer" + BUILD_IN_SOURCE 0 + CMAKE_ARGS -D CMAKE_BXZSTR_HEADERS=${CMAKE_BXZSTR_HEADERS} + -D CMAKE_CXXARGS_HEADERS=${CMAKE_CXXARGS_HEADERS} + -D CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -D "CMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + -D "CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + -D "CMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + -D "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + INSTALL_COMMAND "" + ) + FetchContent_MakeAvailable(alignment-writer) + add_dependencies(mSWEEP libalignmentwriter) set_target_properties(alignment-writer PROPERTIES EXCLUDE_FROM_ALL 1) - set(CMAKE_ALIGNMENT_WRITER_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/external/alignment-writer/include) + set(CMAKE_ALIGNMENT_WRITER_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/alignment-writer/include) + set(CMAKE_ALIGNMENT_WRITER_LIBRARY ${CMAKE_CURRENT_BINARY_DIR}/lib/libalignment-writer.a) endif() include_directories(${CMAKE_ALIGNMENT_WRITER_HEADERS}) +target_link_libraries(mSWEEP ${CMAKE_ALIGNMENT_WRITER_LIBRARY}) ## telescope if (DEFINED CMAKE_TELESCOPE_LIBRARY AND DEFINED CMAKE_TELESCOPE_HEADERS) - find_library(TELESCOPE NAMES telescope HINTS ${CMAKE_TELESCOPE_LIBRARY}) - target_link_libraries(mSWEEP ${TELESCOPE}) + message(STATUS "telescope headers provided in: ${CMAKE_TELESCOPE_HEADERS}") + message(STATUS "telescope library provided in: ${CMAKE_TELESCOPE_LIBRARY}") else() - configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/CMakeLists-telescope.txt.in ${CMAKE_CURRENT_BINARY_DIR}/external/telescope-download/CMakeLists.txt) - execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/telescope-download ) - if(result) - message(FATAL_ERROR "CMake step for telescope failed: ${result}") - endif() - execute_process(COMMAND ${CMAKE_COMMAND} --build . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/telescope-download ) - if(result) - message(FATAL_ERROR "Build step for telescope failed: ${result}") - endif() - add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/external/telescope - ${CMAKE_CURRENT_BINARY_DIR}/external/telescope/build) + FetchContent_Declare(telescope + GIT_REPOSITORY https://github.com/tmaklin/telescope.git + GIT_TAG v0.7.0-prerelease + PREFIX "external" + SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external/telescope" + BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/telescope" + BUILD_IN_SOURCE 0 + CMAKE_ARGS -D CMAKE_BXZSTR_HEADERS=${CMAKE_BXZSTR_HEADERS} + -D CMAKE_CXXARGS_HEADERS=${CMAKE_CXXARGS_HEADERS} + -D CMAKE_CXXIO_HEADERS=${CMAKE_CXXIO_HEADERS} + -D CMAKE_ALIGNMENT_WRITER_HEADERS=${CMAKE_ALIGNMENT_WRITER_HEADERS} + -D CMAKE_ALIGNMENT_WRITER_LIBRARY=${CMAKE_ALIGNMENT_WRITER_LIBRARY} + -D CMAKE_BITMAGIC_HEADERS=${CMAKE_CURRENT_SOURCE_DIR}/external/telescope/external/BitMagic-7.12.3/src + -D CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -D "CMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + -D "CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + -D "CMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + -D "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + INSTALL_COMMAND "" + ) + FetchContent_MakeAvailable(telescope) + add_dependencies(telescope libalignmentwriter) + add_dependencies(mSWEEP telescope) set_target_properties(telescope PROPERTIES EXCLUDE_FROM_ALL 1) - set(CMAKE_TELESCOPE_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/external/telescope/include) - set(CMAKE_BITMAGIC_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/external/telescope/external/BitMagic-7.12.3/src) - get_property(CMAKE_TELESCOPE_LIBRARY TARGET telescope PROPERTY LOCATION) - target_link_libraries(mSWEEP libtelescope libalignmentwriter) + set(CMAKE_TELESCOPE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/telescope/include) + set(CMAKE_TELESCOPE_LIBRARY ${CMAKE_CURRENT_BINARY_DIR}/lib/libtelescope.a) + set(CMAKE_BITMAGIC_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/telescope/external/BitMagic-7.12.3/src) endif() include_directories(${CMAKE_TELESCOPE_HEADERS} ${CMAKE_BITMAGIC_HEADERS}) +target_link_libraries(mSWEEP ${CMAKE_TELESCOPE_LIBRARY}) ## seamat if (DEFINED CMAKE_SEAMAT_HEADERS) message(STATUS "seamat headers provided in: ${CMAKE_SEAMAT_HEADERS}") else() - configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/CMakeLists-seamat.txt.in ${CMAKE_CURRENT_BINARY_DIR}/external/seamat-download/CMakeLists.txt) - execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/seamat-download ) - if(result) - message(FATAL_ERROR "CMake step for seamat failed: ${result}") - endif() - execute_process(COMMAND ${CMAKE_COMMAND} --build . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/seamat-download ) - if(result) - message(FATAL_ERROR "Build step for seamat failed: ${result}") - endif() - add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/external/seamat - ${CMAKE_CURRENT_BINARY_DIR}/external/seamat/build) - target_link_libraries(mSWEEP rcgomp) - set(CMAKE_SEAMAT_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/external/seamat/include) + FetchContent_Declare(seamat + GIT_REPOSITORY https://github.com/tmaklin/seamat.git + GIT_TAG v0.2.2 + PREFIX "external" + SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external/seamat" + BUILD_IN_SOURCE 0 + CMAKE_ARGS -D CMAKE_BUILD_TESTS=0 + BUILD_COMMAND "" + CONFIGURE_COMMAND "" + INSTALL_COMMAND "" + ) + FetchContent_MakeAvailable(seamat) + set(CMAKE_SEAMAT_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/seamat/include ${CMAKE_CURRENT_BINARY_DIR}/_deps/seamat-build/include) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/mSWEEP_openmp_config.hpp.in ${CMAKE_CURRENT_BINARY_DIR}/include/mSWEEP_openmp_config.hpp @ONLY) endif() include_directories(${CMAKE_SEAMAT_HEADERS}) ## rcgpar -if (DEFINED CMAKE_RCGPAR_LIBRARY AND DEFINED CMAKE_RCGPAR_HEADERS) - find_library(RCGPAR NAMES rcgpar HINTS ${CMAKE_RCGPAR_LIBRARY}) - target_link_libraries(mSWEEP ${RCGPAR}) +if (DEFINED CMAKE_RCGPAR_LIBRARY AND DEFINED CMAKE_RCGPAR_HEADERS AND DEFINED CMAKE_RCGUTILS_LIBRARY) + message(STATUS "rcgpar headers provided in: ${CMAKE_RCGPAR_HEADERS}") + message(STATUS "rcgpar library provided in: ${CMAKE_RCGPAR_LIBRARY}") + message(STATUS "rcgutils library provided in: ${CMAKE_RCGUTILS_LIBRARY}") else() - configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/CMakeLists-rcgpar.txt.in ${CMAKE_CURRENT_BINARY_DIR}/external/rcgpar-download/CMakeLists.txt) - execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/rcgpar-download ) - if(result) - message(FATAL_ERROR "CMake step for rcgpar failed: ${result}") - endif() - execute_process(COMMAND ${CMAKE_COMMAND} --build . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/rcgpar-download ) - if(result) - message(FATAL_ERROR "Build step for rcgpar failed: ${result}") - endif() - add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/external/rcgpar - ${CMAKE_CURRENT_BINARY_DIR}/external/rcgpar/build) - target_link_libraries(mSWEEP rcgomp) - set(CMAKE_RCGPAR_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/external/rcgpar/include) + FetchContent_Declare(rcgpar + GIT_REPOSITORY https://github.com/tmaklin/rcgpar.git + GIT_TAG v1.1.3 + PREFIX "external" + SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external/rcgpar" + BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/rcgpar" + BUILD_IN_SOURCE 0 + CMAKE_ARGS -D CMAKE_ENABLE_MPI_SUPPORT=${MSWEEP_MPI_SUPPORT} + -D CMAKE_SEAMAT_HEADERS=${CMAKE_SEAMAT_HEADERS} + -D CMAKE_BITMAGIC_HEADERS=${CMAKE_BITMAGIC_HEADERS} + -D CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -D "CMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + -D "CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + -D "CMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + -D "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + INSTALL_COMMAND "" + ) + FetchContent_MakeAvailable(rcgpar) + add_dependencies(mSWEEP rcgomp rcgutils) + set(CMAKE_RCGPAR_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/rcgpar/include) + set(CMAKE_RCGPAR_LIBRARY "${CMAKE_CURRENT_BINARY_DIR}/lib/librcgomp.a") + set(CMAKE_RCGUTILS_LIBRARY "${CMAKE_CURRENT_BINARY_DIR}/lib/librcgutils.a") endif() +target_link_libraries(mSWEEP ${CMAKE_RCGPAR_LIBRARY} ${CMAKE_RCGUTILS_LIBRARY}) include_directories(${CMAKE_RCGPAR_HEADERS}) -## BitMagic -set(CMAKE_BITMAGIC_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/external/telescope/external/BitMagic-7.12.3/src) -include_directories(${CMAKE_BITMAGIC_HEADERS}) - ## mGEMS if (DEFINED CMAKE_MGEMS_LIBRARY AND DEFINED CMAKE_MGEMS_HEADERS) - find_library(MGEMS NAMES mgems HINTS ${CMAKE_MGEMS_LIBRARY}) - target_link_libraries(mSWEEP ${MGEMS}) + message(STATUS "mGEMS headers provided in: ${CMAKE_MGEMS_HEADERS}") + message(STATUS "mGEMS library provided in: ${CMAKE_MGEMS_LIBRARY}") else() - configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/CMakeLists-mGEMS.txt.in ${CMAKE_CURRENT_BINARY_DIR}/external/mGEMS-download/CMakeLists.txt) - execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/mGEMS-download ) - if(result) - message(FATAL_ERROR "CMake step for libmgems failed: ${result}") - endif() - execute_process(COMMAND ${CMAKE_COMMAND} --build . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/mGEMS-download ) - if(result) - message(FATAL_ERROR "Build step for libmgems failed: ${result}") - endif() - add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/external/mGEMS - ${CMAKE_CURRENT_BINARY_DIR}/external/mGEMS/build) + FetchContent_Declare(mGEMS + GIT_REPOSITORY https://github.com/PROBIC/mGEMS.git + GIT_TAG v1.3.0 + PREFIX "external" + SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external/mGEMS" + BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/mGEMS" + BUILD_IN_SOURCE 0 + CMAKE_ARGS -D CMAKE_BXZSTR_HEADERS=${CMAKE_BXZSTR_HEADERS} + -D CMAKE_CXXARGS_HEADERS=${CMAKE_CXXARGS_HEADERS} + -D CMAKE_CXXIO_HEADERS=${CMAKE_CXXIO_HEADERS} + -D CMAKE_ALIGNMENT_WRITER_HEADERS=${CMAKE_ALIGNMENT_WRITER_HEADERS} + -D CMAKE_BITMAGIC_HEADERS=${CMAKE_CURRENT_BINARY_DIR}/external/telescope/external/BitMagic-7.12.3/src + -D CMAKE_SEAMAT_HEADERS=${CMAKE_SEAMAT_HEADERS} + -D CMAKE_TELESCOPE_HEADERS=${CMAKE_TELESCOPE_HEADERS} + -D CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -D "CMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + -D "CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + -D "CMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + -D "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + INSTALL_COMMAND "" + ) + FetchContent_MakeAvailable(mGEMS) + add_dependencies(mGEMS telescope libalignmentwriter) + add_dependencies(mSWEEP libmgems) set_target_properties(mGEMS PROPERTIES EXCLUDE_FROM_ALL 1) - target_link_libraries(mSWEEP libmgems) - set(CMAKE_MGEMS_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/external/mGEMS/include) + set(CMAKE_MGEMS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/mGEMS/include) + set(CMAKE_MGEMS_LIBRARY ${CMAKE_CURRENT_BINARY_DIR}/external/mGEMS/lib/libmgems.a) endif() +target_link_libraries(mSWEEP ${CMAKE_MGEMS_LIBRARY} ${CMAKE_TELESCOPE_LIBRARY} ${CMAKE_ALIGNMENT_WRITER_LIBRARY}) include_directories(${CMAKE_MGEMS_HEADERS}) -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include -${CMAKE_CURRENT_SOURCE_DIR}/external ${CMAKE_CURRENT_SOURCE_DIR}/include/tools -${CMAKE_CURRENT_SOURCE_DIR}/external/cxxio -${CMAKE_CURRENT_BINARY_DIR}/include) +include_directories( + ${CMAKE_CURRENT_SOURCE_DIR}/include + ${CMAKE_CURRENT_SOURCE_DIR}/external ${CMAKE_CURRENT_SOURCE_DIR}/include/tools + ${CMAKE_CURRENT_BINARY_DIR}/include + ) if(MPI_FOUND) add_dependencies(mSWEEP rcgmpi bigmpi) diff --git a/README.md b/README.md index 28c7832..c7ed8b1 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,7 @@ i.e. the file format is automatically detected (alignment-writer v0.4.0 and newe We recommend running [demix\_check](https://github.com/tmaklin/coreutils_demix_check) on the binned reads and/or [checkm](https://github.com/Ecogenomics/CheckM) on the bin-assembled genomes (BAGs) to evaluate the accuracy of the results. ## Working with large alignment files +### Compressing Themisto output files For complex input data with many organisms, the pseudoalignment files from Themisto can get infeasibly large. In these cases, [alignment-writer](https://github.com/tmaklin/alignment-writer) can be used to compress the alignment files to <10% of the original size. mSWEEP >=v2.0.0 can read the compressed alignments in directly by running @@ -100,6 +101,32 @@ mSWEEP --themisto-1 fwd_compressed.aln --themisto-2 rev_compressed.aln -i cluste ``` +### Running estimation on large sparse alignments +If the target alignment is sparse, meaning that there are target groups which have few/no reads aligning against them in the whole sample, mSWEEP can be instructed to ignore these in the estimation by adding the `--min-hits 1` flag: +``` +mSWEEP --themisto sparse.aln -i clustering.txt -t 2 --min-hits 1 +``` +This will reduce the runtime and memory use of the estimation proportional to how many target groups are removed. Using `--min-hits 1` does not affect the results beyond differences in computational accuracy. + +The `--min-hits` flag also accepts values higher than 1 for pruning target groups with a small number of aligned reads. Using a value higher than 1 will change the resulting values. + +## (experimental) Reliability of abundance estimates +Add the `--run-rate` flag to calculate a relative reliability value for each abundance estimate using a variation of the [RATE method](https://doi.org/10.1214/18-AOAS1222) +``` +mSWEEP --themisto compressed.aln -i clustering.txt -t 2 --run-rate +``` +This will append the RATE and KLD columns to the output. RATE values that exceed `1/(number of lineages in clustering.txt)` are considered reliably estimated. + +If the reference contains many sequences that have zero or very few pseudoalignments, the denominator should be set to the number of lineages that have a nonzero abundance estimate instead of the total lineage count. + +A reliably estimated value means that an abundance estimate from +mSWEEP has a large effect on how well the statistical model in mSWEEP +fits to the input alignment data. This translates to a high value in +the KLD column and the RATE columns, which is derived from the KLD +values by dividing each value by the sum of all KLDs. + +__RATE as implemented in mSWEEP has not been tested thoroughly and is considered experimental.__ Consider using additional methods to verify the correctness of your results after filtering by RATE. + ## More options mSWEEP additionally accepts the following flags: @@ -153,6 +180,11 @@ Likelihood options: -q Mean for the beta-binomial component (default: 0.65). -e Dispersion term for the beta-binomial component (default: 0.01). --alphas Prior counts for the relative abundances, supply as comma-separated nonzero values (default: all 1.0). +--zero-inflation Likelihood of an observation that contains 0 pseudoalignments against a reference group (default: 0.01). + +Experimental options: +--run-rate Calculate relative reliability for each abundance estimate using RATE (default: false). +--min-hits Only consider target groups that have at least this many reads align to any sequence in them (default: 0). ``` # References diff --git a/config/CMakeLists-alignment-writer.txt.in b/config/CMakeLists-alignment-writer.txt.in deleted file mode 100644 index b42adbd..0000000 --- a/config/CMakeLists-alignment-writer.txt.in +++ /dev/null @@ -1,17 +0,0 @@ -cmake_minimum_required(VERSION 2.8.2) - -project(alignment-writer-get NONE) -include(ExternalProject) - -ExternalProject_Add(alignment-writer-download - GIT_REPOSITORY https://github.com/tmaklin/alignment-writer.git - GIT_TAG v0.4.0 - SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/alignment-writer" - BUILD_IN_SOURCE 0 - BUILD_COMMAND "" - CMAKE_ARGS -D CMAKE_BXZSTR_HEADERS=${CMAKE_BXZSTR_HEADERS} - -D CMAKE_CXXARGS_HEADERS=${CMAKE_CXXARGS_HEADERS} - INSTALL_COMMAND "" - TEST_COMMAND "" - UPDATE_COMMAND "" -) diff --git a/config/CMakeLists-bxzstr.txt.in b/config/CMakeLists-bxzstr.txt.in deleted file mode 100644 index 3c5384f..0000000 --- a/config/CMakeLists-bxzstr.txt.in +++ /dev/null @@ -1,16 +0,0 @@ -cmake_minimum_required(VERSION 2.8.2) - -project(bxzstr-get NONE) -include(ExternalProject) - -ExternalProject_Add(bxzstr-download - GIT_REPOSITORY https://github.com/tmaklin/bxzstr.git - GIT_TAG v1.1.0 - SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/bxzstr" - BUILD_IN_SOURCE 0 - BUILD_COMMAND "" - CMAKE_ARGS -D ZSTD_FOUND=0 - INSTALL_COMMAND "" - TEST_COMMAND "" - UPDATE_COMMAND "" -) diff --git a/config/CMakeLists-cxxargs.txt.in b/config/CMakeLists-cxxargs.txt.in deleted file mode 100644 index fedfd51..0000000 --- a/config/CMakeLists-cxxargs.txt.in +++ /dev/null @@ -1,16 +0,0 @@ -cmake_minimum_required(VERSION 2.8.2) - -project(cxxargs-get NONE) -include(ExternalProject) - -ExternalProject_Add(cxxargs-download - GIT_REPOSITORY https://github.com/tmaklin/cxxargs.git - GIT_TAG v1.1.4 - SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/cxxargs" - BUILD_IN_SOURCE 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" - UPDATE_COMMAND "" -) diff --git a/config/CMakeLists-cxxio.txt.in b/config/CMakeLists-cxxio.txt.in deleted file mode 100644 index b448f55..0000000 --- a/config/CMakeLists-cxxio.txt.in +++ /dev/null @@ -1,16 +0,0 @@ -cmake_minimum_required(VERSION 2.8.2) - -project(cxxio-get NONE) -include(ExternalProject) - -ExternalProject_Add(cxxio-download - GIT_REPOSITORY https://github.com/tmaklin/cxxio.git - GIT_TAG v0.1.0 - SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/cxxio" - BUILD_IN_SOURCE 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" - UPDATE_COMMAND "" -) diff --git a/config/CMakeLists-mGEMS.txt.in b/config/CMakeLists-mGEMS.txt.in deleted file mode 100644 index da5b462..0000000 --- a/config/CMakeLists-mGEMS.txt.in +++ /dev/null @@ -1,22 +0,0 @@ -cmake_minimum_required(VERSION 2.8.2) - -project(mGEMS-get NONE) -include(ExternalProject) - -ExternalProject_Add(mGEMS-download - GIT_REPOSITORY https://github.com/PROBIC/mGEMS.git - GIT_TAG v1.3.0 - SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/mGEMS" - BUILD_IN_SOURCE 0 - BUILD_COMMAND "" - CMAKE_ARGS -D CMAKE_BXZSTR_HEADERS=${CMAKE_BXZSTR_HEADERS} - -D CMAKE_CXXARGS_HEADERS=${CMAKE_CXXARGS_HEADERS} - -D CMAKE_CXXIO_HEADERS=${CMAKE_CXXIO_HEADERS} - -D CMAKE_TELESCOPE_HEADERS=$CMAKE_TELESCOPE_HEADERS - -D CMAKE_SEAMAT_HEADERS=${CMAKE_SEAMAT_HEADERS} - -D CMAKE_BITMAGIC_HEADERS=${CMAKE_CURRENT_BINARY_DIR}/external/telescope/external/BitMagic-7.12.3/src - -D CMAKE_ALIGNMENT_WRITER_HEADERS=$CMAKE_ALIGNMENT_WRITER_HEADERS - INSTALL_COMMAND "" - TEST_COMMAND "" - UPDATE_COMMAND "" -) diff --git a/config/CMakeLists-rcgpar.txt.in b/config/CMakeLists-rcgpar.txt.in deleted file mode 100644 index 9fa41fa..0000000 --- a/config/CMakeLists-rcgpar.txt.in +++ /dev/null @@ -1,18 +0,0 @@ -cmake_minimum_required(VERSION 2.8.2) - -project(rcgpar-get NONE) -include(ExternalProject) - -ExternalProject_Add(rcgpar-download - GIT_REPOSITORY https://github.com/tmaklin/rcgpar - GIT_TAG v1.1.0 - SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/rcgpar" - BUILD_IN_SOURCE 0 - BUILD_COMMAND "" - CMAKE_ARGS -D CMAKE_ENABLE_MPI_SUPPORT=${MSWEEP_MPI_SUPPORT} - -D CMAKE_SEAMAT_HEADERS=${CMAKE_SEAMAT_HEADERS} - -D CMAKE_BITMAGIC_HEADERS=${CMAKE_BITMAGIC_HEADERS} - INSTALL_COMMAND "" - TEST_COMMAND "" - UPDATE_COMMAND "" -) diff --git a/config/CMakeLists-seamat.txt.in b/config/CMakeLists-seamat.txt.in deleted file mode 100644 index 3e576a2..0000000 --- a/config/CMakeLists-seamat.txt.in +++ /dev/null @@ -1,16 +0,0 @@ -cmake_minimum_required(VERSION 2.8.2) - -project(seamat-get NONE) -include(ExternalProject) - -ExternalProject_Add(seamat-download - GIT_REPOSITORY https://github.com/tmaklin/seamat - GIT_TAG v0.2.1 - SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/seamat" - BUILD_IN_SOURCE 0 - BUILD_COMMAND "" - CMAKE_ARGS -D CMAKE_BUILD_TESTS=0 - INSTALL_COMMAND "" - TEST_COMMAND "" - UPDATE_COMMAND "" -) diff --git a/config/CMakeLists-telescope.txt.in b/config/CMakeLists-telescope.txt.in deleted file mode 100644 index b64ee3f..0000000 --- a/config/CMakeLists-telescope.txt.in +++ /dev/null @@ -1,20 +0,0 @@ -cmake_minimum_required(VERSION 2.8.2) - -project(telescope-get NONE) -include(ExternalProject) - -ExternalProject_Add(telescope-download - GIT_REPOSITORY https://github.com/tmaklin/telescope.git - GIT_TAG v0.6.2 - SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/telescope" - BUILD_IN_SOURCE 0 - BUILD_COMMAND "" - CMAKE_ARGS -D CMAKE_BXZSTR_HEADERS=${CMAKE_BXZSTR_HEADERS} - -D CMAKE_CXXARGS_HEADERS=${CMAKE_CXXARGS_HEADERS} - -D CMAKE_CXXIO_HEADERS=${CMAKE_CXXIO_HEADERS} - -D CMAKE_BITMAGIC_HEADERS=${CMAKE_CURRENT_BINARY_DIR}/external/telescope/external/BitMagic-7.12.3/src - -D CMAKE_ALIGNMENT_WRITER_HEADERS=$CMAKE_ALIGNMENT_WRITER_HEADERS - INSTALL_COMMAND "" - TEST_COMMAND "" - UPDATE_COMMAND "" -) diff --git a/config/CMakeLists-zlib.txt.in b/config/CMakeLists-zlib.txt.in deleted file mode 100644 index 3cfd215..0000000 --- a/config/CMakeLists-zlib.txt.in +++ /dev/null @@ -1,15 +0,0 @@ -cmake_minimum_required(VERSION 2.8.2) - -project(zlib-get NONE) -include(ExternalProject) - -ExternalProject_Add(zlib-download - GIT_REPOSITORY https://github.com/madler/zlib.git - GIT_TAG v1.2.11 - SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/zlib" - BUILD_IN_SOURCE 1 - CONFIGURE_COMMAND ${CMAKE_BINARY_DIR}/external/zlib/configure --static - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" -) diff --git a/include/Likelihood.hpp b/include/Likelihood.hpp index 82153ff..73046bd 100644 --- a/include/Likelihood.hpp +++ b/include/Likelihood.hpp @@ -75,6 +75,8 @@ class Likelihood { // Get the ec counts virtual const std::vector& log_counts() const =0; + // Get vector indicating which groups were included + virtual const std::vector& groups_considered() const =0; }; template @@ -83,6 +85,9 @@ class LL_WOR21 : public Likelihood { seamat::DenseMatrix log_likelihoods; std::vector log_ec_counts; std::vector> bb_params; + std::vector groups_mask; + T zero_inflation; + T bb_constants[2]; seamat::DenseMatrix precalc_lls(const std::vector &group_sizes, const size_t n_groups) { V max_size = 0; // Storing the grouping can take a lot less space if it can be done with uint16_t or uint8_t. @@ -90,27 +95,54 @@ class LL_WOR21 : public Likelihood { max_size = (group_sizes[i] > max_size ? group_sizes[i] : max_size); } - seamat::DenseMatrix ll_mat(n_groups, max_size + 1, -4.60517); + seamat::DenseMatrix ll_mat(n_groups, max_size + 1, std::log(this->zero_inflation)); #pragma omp parallel for schedule(static) shared(ll_mat) for (size_t i = 0; i < n_groups; ++i) { for (V j = 1; j <= max_size; ++j) { - ll_mat(i, j) = ldbb_scaled(j, group_sizes[i], this->bb_params[i][0], this->bb_params[i][1]) - 0.01005034; // log(0.99) = -0.01005034 + ll_mat(i, j) = ldbb_scaled(j, group_sizes[i], this->bb_params[i][0], this->bb_params[i][1]) + std::log1p(-this->zero_inflation); } } return ll_mat; } - void fill_ll_mat(const telescope::Alignment &alignment, const std::vector &group_sizes, const size_t n_groups) { + void fill_ll_mat(const telescope::Alignment &alignment, const std::vector &group_sizes, const size_t n_groups, const size_t min_hits) { size_t num_ecs = alignment.n_ecs(); - const seamat::DenseMatrix &precalc_lls_mat = this->precalc_lls(group_sizes, n_groups); + bool mask_groups = min_hits > 0; + this->groups_mask = std::vector(n_groups, !mask_groups); + std::vector masked_group_sizes; + if (mask_groups) { + std::vector group_hit_counts(n_groups, (size_t)0); + // Create mask identifying groups that have at least 1 alignment + for (size_t i = 0; i < num_ecs; ++i) { + for (size_t j = 0; j < n_groups; ++j) { + group_hit_counts[j] += (alignment(j, i) > 0) * alignment.reads_in_ec(i); + } + } + for (size_t i = 0; i < n_groups; ++i) { + this->groups_mask[i] = groups_mask[i] || (group_hit_counts[i] >= min_hits); + if (this->groups_mask[i]) { + masked_group_sizes.push_back(group_sizes[i]); + } + } + } else { + masked_group_sizes = group_sizes; + } + size_t n_masked_groups = masked_group_sizes.size(); + + this->update_bb_parameters(masked_group_sizes, n_masked_groups, this->bb_constants); + const seamat::DenseMatrix &precalc_lls_mat = this->precalc_lls(masked_group_sizes, n_masked_groups); - this->log_likelihoods.resize(n_groups, num_ecs, -4.60517); // -4.60517 = log(0.01) + this->log_likelihoods.resize(n_masked_groups, num_ecs, std::log(this->zero_inflation)); #pragma omp parallel for schedule(static) shared(precalc_lls_mat) for (size_t j = 0; j < num_ecs; ++j) { + size_t groups_pos = 0; for (size_t i = 0; i < n_groups; ++i) { - this->log_likelihoods(i, j) = precalc_lls_mat(i, alignment(i, j)); + if (this->groups_mask[i]) { + this->log_likelihoods(groups_pos, j) = precalc_lls_mat(groups_pos, alignment(i, j)); + ++groups_pos; + } } } } @@ -139,14 +171,15 @@ class LL_WOR21 : public Likelihood { public: LL_WOR21() = default; - LL_WOR21(const std::vector &group_sizes, const telescope::Alignment &alignment, const size_t n_groups, const T tol, const T frac_mu) { - T bb_constants[2] = { tol, frac_mu }; - this->update_bb_parameters(group_sizes, n_groups, bb_constants); - this->from_grouped_alignment(alignment, group_sizes, n_groups); + LL_WOR21(const std::vector &group_sizes, const telescope::Alignment &alignment, const size_t n_groups, const T tol, const T frac_mu, const size_t min_hits, const T _zero_inflation) { + this->bb_constants[0] = tol; + this->bb_constants[1] = frac_mu; + this->zero_inflation = _zero_inflation; + this->from_grouped_alignment(alignment, group_sizes, n_groups, min_hits); } - void from_grouped_alignment(const telescope::Alignment &alignment, const std::vector &group_sizes, const size_t n_groups) { - this->fill_ll_mat(alignment, group_sizes, n_groups); + void from_grouped_alignment(const telescope::Alignment &alignment, const std::vector &group_sizes, const size_t n_groups, const size_t min_hits) { + this->fill_ll_mat(alignment, group_sizes, n_groups, min_hits); this->fill_ec_counts(alignment); } @@ -256,51 +289,53 @@ class LL_WOR21 : public Likelihood { // Get the ec counts const std::vector& log_counts() const override { return this->log_ec_counts; }; + // Get the groups mask + const std::vector& groups_considered() const override { return this->groups_mask; }; }; template -std::unique_ptr> ConstructAdaptiveLikelihood(const telescope::Alignment &alignment, const Grouping &grouping, const T q, const T e) { +std::unique_ptr> ConstructAdaptiveLikelihood(const telescope::Alignment &alignment, const Grouping &grouping, const T q, const T e, const size_t min_hits, const T zero_inflation) { size_t max_group_size = grouping.max_group_size(); size_t n_groups = grouping.get_n_groups(); std::unique_ptr> log_likelihoods; if (max_group_size <= std::numeric_limits::max()) { if (n_groups <= std::numeric_limits::max()) { - log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e)); + log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e, min_hits, zero_inflation)); } else if (n_groups <= std::numeric_limits::max()) { - log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e)); + log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e, min_hits, zero_inflation)); } else if (n_groups <= std::numeric_limits::max()) { - log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e)); + log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e, min_hits, zero_inflation)); } else { - log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e)); + log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e, min_hits, zero_inflation)); } } else if (max_group_size <= std::numeric_limits::max()) { if (n_groups <= std::numeric_limits::max()) { - log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e)); + log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e, min_hits, zero_inflation)); } else if (n_groups <= std::numeric_limits::max()) { - log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e)); + log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e, min_hits, zero_inflation)); } else if (n_groups <= std::numeric_limits::max()) { - log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e)); + log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e, min_hits, zero_inflation)); } else { - log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e)); + log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e, min_hits, zero_inflation)); } } else if (max_group_size <= std::numeric_limits::max()) { if (n_groups <= std::numeric_limits::max()) { - log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e)); + log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e, min_hits, zero_inflation)); } else if (n_groups <= std::numeric_limits::max()) { - log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e)); + log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e, min_hits, zero_inflation)); } else if (n_groups <= std::numeric_limits::max()) { - log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e)); + log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e, min_hits, zero_inflation)); } else { - log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e)); + log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e, min_hits, zero_inflation)); } } else { if (n_groups <= std::numeric_limits::max()) { - log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e)); + log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e, min_hits, zero_inflation)); } else if (n_groups <= std::numeric_limits::max()) { - log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e)); + log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e, min_hits, zero_inflation)); } else if (n_groups <= std::numeric_limits::max()) { - log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e)); + log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e, min_hits, zero_inflation)); } else { - log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e)); + log_likelihoods.reset(new mSWEEP::LL_WOR21(static_cast*>(&grouping)->get_sizes(), alignment, n_groups, q, e, min_hits, zero_inflation)); } } return log_likelihoods; diff --git a/include/Sample.hpp b/include/Sample.hpp index faad5f1..8a3ce79 100644 --- a/include/Sample.hpp +++ b/include/Sample.hpp @@ -42,6 +42,10 @@ class Sample { size_t counts_total; seamat::DenseMatrix ec_probabilities; + // Relative abundance estimate scoring via RATEs + bool rate_run = false; + std::vector log_KLDs; + protected: void count_alignments(const telescope::Alignment &alignment); @@ -54,18 +58,32 @@ class Sample { virtual const std::vector& get_abundances() const =0; // Write the relative abundances virtual void write_abundances(const std::vector &group_names, std::ostream *of) const =0; + virtual void write_abundances2(const std::vector &estimated_group_names, + const std::vector &zero_group_names, std::ostream *of) const =0; // Non-virtuals // Store equivalence class probabilities void store_probs(const seamat::DenseMatrix &probs) { this->ec_probabilities = std::move(probs); } + // Write equivalence class probabilities void write_probs(const std::vector &cluster_indicators_to_string, std::ostream *of); + void write_probs2(const std::vector &cluster_indicators_to_string, + const std::vector &zero_indicators_to_string, std::ostream *of); + + // Calculate KLDs from probs + void dirichlet_kld(const std::vector &log_ec_hit_counts); // Getters size_t get_counts_total() const { return this->counts_total; }; size_t get_n_reads() const { return this->n_reads; }; + const seamat::DenseMatrix& get_probs() const { return this->ec_probabilities; } + const std::vector& get_log_klds() const { return this->log_KLDs; } + std::vector get_rates() const; + size_t get_n_ecs() const { return this->ec_probabilities.get_rows(); } + size_t get_n_refs() const { return this->ec_probabilities.get_cols(); } + size_t get_rate_run() const { return this->rate_run; } }; class Binning { @@ -102,6 +120,8 @@ class PlainSample : public Sample { // Write the relative abundances void write_abundances(const std::vector &group_names, std::ostream *of) const override; + void write_abundances2(const std::vector &estimated_group_names, + const std::vector &zero_group_names, std::ostream *of) const override; // Getters const std::vector& get_abundances() const override { return this->relative_abundances; } @@ -161,6 +181,8 @@ class BootstrapSample : public Sample { // Write the bootstrap results void write_abundances(const std::vector &group_names, std::ostream *os) const override; + void write_abundances2(const std::vector &estimated_group_names, + const std::vector &zero_group_names, std::ostream *of) const override; // Getters const std::vector& get_abundances() const override { return this->bootstrap_results[0]; } diff --git a/config/mSWEEP_mpi_config.hpp.in b/include/mSWEEP_mpi_config.hpp.in similarity index 100% rename from config/mSWEEP_mpi_config.hpp.in rename to include/mSWEEP_mpi_config.hpp.in diff --git a/config/mSWEEP_openmp_config.hpp.in b/include/mSWEEP_openmp_config.hpp.in similarity index 100% rename from config/mSWEEP_openmp_config.hpp.in rename to include/mSWEEP_openmp_config.hpp.in diff --git a/config/mSWEEP_version.h.in b/include/mSWEEP_version.h.in similarity index 100% rename from config/mSWEEP_version.h.in rename to include/mSWEEP_version.h.in diff --git a/src/BootstrapSample.cpp b/src/BootstrapSample.cpp index fb7f15e..694baf9 100644 --- a/src/BootstrapSample.cpp +++ b/src/BootstrapSample.cpp @@ -95,4 +95,37 @@ void BootstrapSample::write_abundances(const std::vector &group_nam } } +void BootstrapSample::write_abundances2(const std::vector &estimated_group_names, + const std::vector &zero_group_names, std::ostream *of) const { + // Write relative abundances to a file, + // outputs to std::cout if outfile is empty. + if (of->good()) { + (*of) << "#mSWEEP_version:" << '\t' << MSWEEP_BUILD_VERSION << '\n'; + (*of) << "#num_reads:" << '\t' << this->get_n_reads() << '\n'; + (*of) << "#num_aligned:" << '\t' << this->get_counts_total() << '\n'; + (*of) << "#bootstrap_iters:" << '\t' << this->iters << '\n'; + (*of) << "#c_id" << '\t' << "mean_theta" << '\t' << "bootstrap_mean_thetas" << '\n'; + + size_t n_targets = estimated_group_names.size() + zero_group_names.size(); + for (size_t i = 0; i < n_targets; ++i) { + if (i < estimated_group_names.size()) { + (*of) << estimated_group_names[i] << '\t'; + (*of) << this->bootstrap_results[0][i] << '\t'; // First vec has the relative abundances without bootstrapping + for (size_t j = 0; j < this->iters; ++j) { + (*of) << this->bootstrap_results[j + 1][i] << (j == this->iters - 1 ? '\n' : '\t'); + } + } else { + (*of) << zero_group_names[i - estimated_group_names.size()] << '\t'; + (*of) << (double)0.0 << '\t'; // First vec has the relative abundances without bootstrapping + for (size_t j = 0; j < this->iters; ++j) { + (*of) << (double)0.0 << (j == this->iters - 1 ? '\n' : '\t'); + } + + } + } + of->flush(); + } else { + throw std::runtime_error("Could not write to abundances file."); + } +} } diff --git a/src/PlainSample.cpp b/src/PlainSample.cpp index f0c064c..c297a79 100644 --- a/src/PlainSample.cpp +++ b/src/PlainSample.cpp @@ -45,4 +45,29 @@ void PlainSample::write_abundances(const std::vector &group_names, } } +void PlainSample::write_abundances2(const std::vector &estimated_group_names, + const std::vector &zero_group_names, std::ostream *of) const { + // Write relative abundances to &of, + if (of->good()) { + (*of) << "#mSWEEP_version:" << '\t' << MSWEEP_BUILD_VERSION << '\n'; + (*of) << "#num_reads:" << '\t' << this->get_n_reads() << '\n'; + (*of) << "#num_aligned:" << '\t' << this->get_counts_total() << '\n'; + (*of) << "#c_id" << '\t' << "mean_theta" << '\n'; + size_t n_targets = estimated_group_names.size() + zero_group_names.size(); + for (size_t i = 0; i < n_targets; ++i) { + if (i < estimated_group_names.size()) { + (*of) << estimated_group_names[i] << '\t'; + (*of) << this->relative_abundances[i]; + } else { + (*of) << zero_group_names[i - estimated_group_names.size()] << '\t'; + (*of) << (double)0.0; + } + (*of) << '\n'; + } + of->flush(); + } else { + throw std::runtime_error("Can't write to abundances file."); + } +} + } diff --git a/src/Sample.cpp b/src/Sample.cpp index 8e5b1d9..2fd4127 100644 --- a/src/Sample.cpp +++ b/src/Sample.cpp @@ -84,4 +84,101 @@ void Sample::write_probs(const std::vector &cluster_indicators_to_s } } +double digamma(double x) { + double result = 0, xx, xx2, xx4; + for ( ; x < 7; ++x) + result -= 1/x; + x -= 1.0/2.0; + xx = 1.0/x; + xx2 = xx*xx; + xx4 = xx2*xx2; + result += std::log(x)+(1./24.)*xx2-(7.0/960.0)*xx4+(31.0/8064.0)*xx4*xx2-(127.0/30720.0)*xx4*xx4; + return result; +} + +void Sample::dirichlet_kld(const std::vector &log_ec_hit_counts) { + size_t rows = this->get_probs().get_rows(); + size_t cols = this->get_probs().get_cols(); + + std::vector alphas(rows, 0.0); + for (size_t i = 0; i < rows; ++i) { + for (size_t j = 0; j < cols; ++j) { + size_t num_hits = std::round(std::exp(log_ec_hit_counts[j])); + for (size_t k = 0; k < num_hits; ++k) { + alphas[i] += std::exp(this->get_probs()(i, j)); + } + } + } + + double alpha0 = 0.0; + for (size_t i = 0; i < rows; ++i) { + alpha0 += alphas[i]; + } + + this->log_KLDs.resize(rows); + for (size_t i = 0; i < rows; ++i) { + double log_theta = std::log(alphas[i]) - std::log(alpha0); + double alpha_k = alphas[rows - 1]; + double alpha_j = alphas[i]; + double KLD = std::max(std::lgamma(alpha0) - std::lgamma(alpha0 - alpha_j) - std::lgamma(alpha_j) + alpha_j * (digamma(alpha_j) - digamma(alpha0)), 1e-16); + this->log_KLDs[i] = std::log(KLD); + } + + this->rate_run = true; +} + +std::vector Sample::get_rates() const { + double max_elem = 0.0; + // TODO pragma with custom reduction to find maximum + for (size_t i = 0; i < this->log_KLDs.size(); ++i) { + max_elem = (max_elem > this->log_KLDs[i] ? max_elem : this->log_KLDs[i]); + } + double tmp_sum = 0.0; +#pragma omp parallel for schedule(static) reduction(+:tmp_sum) + for (size_t i = 0; i < this->log_KLDs.size(); ++i) { + tmp_sum += std::exp(this->log_KLDs[i] - max_elem); + } + double log_KLDs_sum = std::log(tmp_sum) + max_elem; + + std::vector RATE(this->log_KLDs.size()); +#pragma omp parallel for schedule(static) + for (size_t i = 0; i < this->log_KLDs.size(); ++i) { + RATE[i] = std::exp(this->log_KLDs[i] - log_KLDs_sum); + } + return RATE; +} + +void Sample::write_probs2(const std::vector &estimated_indicators_to_string, + const std::vector &zero_indicators_to_string, std::ostream *of) { + // Write the probability matrix to a file. + if (of->good()) { + *of << "ec_id" << '\t'; + size_t n_rows = estimated_indicators_to_string.size() + zero_indicators_to_string.size(); + size_t n_cols = this->ec_probabilities.get_cols(); + for (size_t i = 0; i < n_rows; ++i) { + if (i < estimated_indicators_to_string.size()) { + *of << estimated_indicators_to_string[i]; + *of << (i < n_rows - 1 ? '\t' : '\n'); + } else { + *of << zero_indicators_to_string[i - estimated_indicators_to_string.size()]; + *of << (i < n_rows - 1 ? '\t' : '\n'); + } + } + for (size_t i = 0; i < n_cols; ++i) { + *of << i << '\t'; + for (size_t j = 0; j < n_rows; ++j) { + if (j < estimated_indicators_to_string.size()) { + *of << std::exp(this->ec_probabilities(j, i)); + } else { + *of << (double)0.0; + } + *of << (j < n_rows - 1 ? '\t' : '\n'); + } + } + *of << std::endl; + of->flush(); + } else { + throw std::runtime_error("Can't write to probs file."); + } +} } diff --git a/src/mSWEEP.cpp b/src/mSWEEP.cpp index b43323e..04d666c 100644 --- a/src/mSWEEP.cpp +++ b/src/mSWEEP.cpp @@ -137,8 +137,12 @@ void parse_args(int argc, char* argv[], cxxargs::Arguments &args) { args.add_short_argument('e', "Dispersion term for the beta-binomial component (default: 0.01).", 0.01); // Prior parameters for estimation args.add_long_argument>("alphas", "Prior counts for the relative abundances, supply as comma-separated nonzero values (default: all 1.0)."); + args.add_long_argument("zero-inflation", "Likelihood of an observation that contains 0 pseudoalignments against a reference group (default: 0.01).\n\nExperimental options:", 0.01); args.set_not_required("alphas"); + args.add_long_argument("run-rate", "Calculate relative reliability for each abundance estimate using RATE (default: false).", false); + args.add_long_argument("min-hits", "Only consider target groups that have at least this many reads align to any sequence in them (default: 0).", (size_t)0); + if (CmdOptionPresent(argv, argv+argc, "--help")) { // Print help message and continue. std::cerr << "\n" + args.help() << '\n' << '\n'; @@ -363,7 +367,7 @@ int main (int argc, char *argv[]) { // Use the alignment data to populate the log_likelihoods matrix. try { - log_likelihoods = mSWEEP::ConstructAdaptiveLikelihood(*alignment, reference->get_grouping(i), args.value('q'), args.value('e')); + log_likelihoods = mSWEEP::ConstructAdaptiveLikelihood(*alignment, reference->get_grouping(i), args.value('q'), args.value('e'), args.value("min-hits"), args.value("zero-inflation")); } catch (std::exception &e) { finalize("Building the log-likelihood array failed:\n " + std::string(e.what()) + "\nexiting\n", log, true); return 1; @@ -400,6 +404,8 @@ int main (int argc, char *argv[]) { return 1; } + std::vector estimated_reference_names; + std::vector zero_reference_names; // Start the abundance estimation part if (args.value("no-fit-model")) { log << "Skipping relative abundance estimation (--no-fit-model toggled)" << '\n'; @@ -407,9 +413,9 @@ int main (int argc, char *argv[]) { log << "Estimating relative abundances" << '\n'; // Prior parameters - std::vector prior_counts(n_groups, 1.0); // Default is all = 1.0 + std::vector prior_counts(log_likelihoods->log_mat().get_rows(), 1.0); // Default is all = 1.0 if (CmdOptionPresent(argv, argv+argc, "--alphas")) { - if (args.value>("alphas").size() != n_groups) { + if (args.value>("alphas").size() != log_likelihoods->log_mat().get_rows()) { finalize("Error: --alphas must have the same number of values as there are groups.", log, true); return 1; } @@ -424,30 +430,50 @@ int main (int argc, char *argv[]) { return 1; } + if (args.value("run-rate")) { + std::cerr << "WARNING: --run-rate is an experimental option that has not been thoroughly tested and is subject to change.\n" << std::endl; + sample->dirichlet_kld(log_likelihoods->log_counts()); + } + + if (args.value("min-hits") > 0) { + std::cerr << "WARNING: --min-hits > 0 is an experimental option that has not been thoroughly tested and is subject to change.\n" << std::endl; + } + // Run binning if requested and write results to files. if (rank == 0) { // root performs the rest. // Turn the probs into relative abundances sample->store_abundances(rcgpar::mixture_components(sample->get_probs(), log_likelihoods->log_counts())); + if (args.value("min-hits") > 0) { + for (size_t j = 0; j < reference->group_names(i).size(); ++j) { + if (log_likelihoods->groups_considered()[j]) { + estimated_reference_names.push_back(reference->group_names(i)[j]); + } else { + zero_reference_names.push_back(reference->group_names(i)[j]); + } + } + } else { + estimated_reference_names = reference->group_names(i); + } // Bin the reads if requested if (bin_reads) { std::vector target_names; if (CmdOptionPresent(argv, argv+argc, "--target-groups")) { target_names = std::move(args.value>("target-groups")); } else { - target_names = reference->group_names(i); + target_names = estimated_reference_names; } if (CmdOptionPresent(argv, argv+argc, "--min-abundance")) { - mGEMS::FilterTargetGroups(reference->group_names(i), sample->get_abundances(), args.value("min-abundance"), &target_names); + mGEMS::FilterTargetGroups(estimated_reference_names, sample->get_abundances(), args.value("min-abundance"), &target_names); } std::vector> bins; try { if (bootstrap_mode) { mSWEEP::BinningBootstrap* bs = static_cast(&(*sample)); - bins = std::move(mGEMS::BinFromMatrix(bs->get_aligned_reads(), sample->get_abundances(), sample->get_probs(), reference->group_names(i), &target_names)); + bins = std::move(mGEMS::BinFromMatrix(bs->get_aligned_reads(), sample->get_abundances(), sample->get_probs(), estimated_reference_names, &target_names)); } else { mSWEEP::BinningSample* bs = static_cast(&(*sample)); - bins = std::move(mGEMS::BinFromMatrix(bs->get_aligned_reads(), sample->get_abundances(), sample->get_probs(), reference->group_names(i), &target_names)); + bins = std::move(mGEMS::BinFromMatrix(bs->get_aligned_reads(), sample->get_abundances(), sample->get_probs(), estimated_reference_names, &target_names)); } } catch (std::exception &e) { finalize("Binning the reads failed:\n " + std::string(e.what()) + "\nexiting\n", log, true); @@ -470,10 +496,18 @@ int main (int argc, char *argv[]) { // Note: this ignores the printing_output variable because // we might want to print the probs even when writing to // pipe them somewhere. - sample->write_probs(reference->group_names(i), &std::cout); + if (args.value("min-hits") > 0) { + sample->write_probs2(estimated_reference_names, zero_reference_names, &std::cout); + } else { + sample->write_probs(estimated_reference_names, &std::cout); + } } if (args.value("write-probs")) { - sample->write_probs(reference->group_names(i), out.probs()); + if (args.value("min-hits") > 0) { + sample->write_probs2(estimated_reference_names, zero_reference_names, out.probs()); + } else { + sample->write_probs(estimated_reference_names, out.probs()); + } } } catch (std::exception &e) { finalize("Writing the probabilities failed:\n " + std::string(e.what()) + "\nexiting\n", log, true); @@ -508,7 +542,38 @@ int main (int argc, char *argv[]) { // Write relative abundances if (rank == 0 && !args.value("no-fit-model")) { try { - sample->write_abundances(reference->group_names(i), out.abundances()); + if (sample->get_rate_run()) { + const std::vector &log_kld = sample->get_log_klds(); + const std::vector &RATE = sample->get_rates(); + const std::vector &relative_abundances = sample->get_abundances(); + + std::ostream *of = out.abundances(); + // Write relative abundances to &of, + if (of->good()) { + (*of) << "#mSWEEP_version:" << '\t' << MSWEEP_BUILD_VERSION << '\n'; + (*of) << "#num_reads:" << '\t' << sample->get_n_reads() << '\n'; + (*of) << "#num_aligned:" << '\t' << sample->get_counts_total() << '\n'; + (*of) << "#c_id" << '\t' << "mean_theta" << '\t' << "RATE" << '\t' << "KLD" << '\n'; + size_t n_targets = estimated_reference_names.size() + zero_reference_names.size(); + for (size_t i = 0; i < n_targets; ++i) { + if (i < estimated_reference_names.size()) { + double KLD = std::exp(log_kld[i]); + (*of) << estimated_reference_names[i] << '\t' << relative_abundances[i] << '\t' << RATE[i] << '\t' << KLD <<'\n'; + } else { + (*of) << zero_reference_names[i - estimated_reference_names.size()] << '\t' << (double)0.0 << '\t' << (double)0.0 << '\t' << (double)0.0 <<'\n'; + } + } + of->flush(); + } else { + throw std::runtime_error("Can't write to abundances file."); + } + } else { + if (args.value("min-hits") > 0) { + sample->write_abundances2(estimated_reference_names, zero_reference_names, out.abundances()); + } else { + sample->write_abundances(estimated_reference_names, out.abundances()); + } + } } catch (std::exception &e) { finalize("Writing the relative abundances failed:\n " + std::string(e.what()) + "\nexiting\n", log, true); return 1;