diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 00000000000..a4d3b96eb34 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,23 @@ +# More info: https://editorconfig.org + +# top-most EditorConfig file +root = true + +# Use spaces with an indentation size of 4 everywhere. +# Also add a newline at the end of every file. +[*] +charset = utf-8 +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true +indent_style = space +indent_size = 4 + +# YAML and Markdown files use 2 space indentation for now. +[*.{yml,md}] +indent_style = space +indent_size = 2 + +# Makefiles require tabs +[Makefile] +indent_style = tab diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml new file mode 100644 index 00000000000..59935e07273 --- /dev/null +++ b/.github/workflows/osx.yml @@ -0,0 +1,35 @@ +name: OSX-build + +on: [push] + +jobs: + osx-clang-omp: + strategy: + fail-fast: false + matrix: + config: + - {shared: "ON", build_type: "Debug", name: "omp/debug/shared"} + - {shared: "OFF", build_type: "Release", name: "omp/release/static"} + name: ${{ matrix.config.name }} + runs-on: [macos-latest] + + steps: + - uses: actions/checkout@v2 + - name: setup + run: brew install libomp + - name: info + run: | + g++ -v + cmake --version + - name: configure + run: | + mkdir build + cd build + cmake .. 
-DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} + make -j8 + ctest -j10 --output-on-failure + - name: install + run: | + cd build + make install + make test_install diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml new file mode 100644 index 00000000000..66a81dda312 --- /dev/null +++ b/.github/workflows/windows-build.yml @@ -0,0 +1,158 @@ +name: Windows-build + +on: [push] + +jobs: + windows_cuda: + name: cuda102/release/shared (only compile) + runs-on: [windows-latest] + steps: + - uses: actions/checkout@v2 + - name: setup + run: | + choco install cuda --version=10.2.89.20191206 -y + - name: configure + run: | + $env:ChocolateyInstall = Convert-Path "$((Get-Command choco).Path)\..\.." + Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1" + refreshenv + mkdir build + cd build + $env:PATH="$pwd\windows_shared_library;$env:PATH" + cmake -DGINKGO_BUILD_CUDA=ON -DGINKGO_BUILD_OMP=OFF .. + cmake --build . -j4 --config Release + + windows_ref: + strategy: + fail-fast: false + matrix: + config: + - {shared: "ON", build_type: "Debug", name: "reference/debug/shared"} + - {shared: "OFF", build_type: "Release", name: "reference/release/static"} + # Debug static needs too much storage + # - {shared: "OFF", build_type: "Debug", name: "reference/debug/static"} + name: msvc/${{ matrix.config.name }} + runs-on: [windows-latest] + steps: + - uses: actions/checkout@v2 + - name: shared_env + if: matrix.config.shared == 'ON' + run: | + echo "::set-env name=origin_path::$env:PATH" + echo "::add-path::$pwd\build\windows_shared_library" + - name: debug_env + if: matrix.config.build_type == 'Debug' + run: | + echo "::set-env name=CXXFLAGS::/bigobj" + - name: configure + run: | + mkdir build + cd build + cmake -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF .. + cmake --build . -j4 --config ${{ matrix.config.build_type }} + ctest . 
-C ${{ matrix.config.build_type }} --output-on-failure + - name: install_shared_env + if: matrix.config.shared == 'ON' + run: | + echo "::set-env name=PATH::C:\Program Files (x86)\Ginkgo\lib;$env:origin_path" + - name: install + run: | + cd build + cmake --install . --config ${{ matrix.config.build_type }} + cmake --build . --target test_install --config ${{ matrix.config.build_type }} + windows_mingw: + strategy: + fail-fast: false + matrix: + config: + - {shared: "ON", build_type: "Debug", name: "omp/debug/shared", cflags: "-O1"} + - {shared: "OFF", build_type: "Release", name: "omp/release/static", cflags: ""} + name: mingw/${{ matrix.config.name }} + runs-on: [windows-latest] + steps: + - uses: actions/checkout@v2 + - name: shared_env + if: matrix.config.shared == 'ON' + run: | + echo "::set-env name=origin_path::$env:PATH" + echo "::add-path::$pwd\build\windows_shared_library" + - name: debug_env + if: matrix.config.build_type == 'Debug' + run: | + echo "::set-env name=CXXFLAGS::-Wa,-mbig-obj" + - name: configure + # Use cmd to remove the path easily + run: | + set PATH=%PATH:C:\Program Files\Git\bin;=% + set PATH=%PATH:C:\Program Files\Git\usr\bin;=% + bcdedit /set IncreaseUserVa 3072 + editbin /LARGEADDRESSAWARE "C:\Program Files\Git\mingw64\bin\cc1plus.exe" + mkdir build + cd build + cmake -G "MinGW Makefiles" -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_COMPILER_FLAGS=${{ matrix.config.cflags }} .. + cmake --build . -j4 + ctest . --output-on-failure + shell: cmd + - name: install_shared_env + if: matrix.config.shared == 'ON' + run: | + echo "::set-env name=PATH::C:\Program Files (x86)\Ginkgo\lib;$env:origin_path" + - name: install + run: | + set PATH=%PATH:C:\Program Files\Git\bin;=% + set PATH=%PATH:C:\Program Files\Git\usr\bin;=% + cd build + cmake --install . + cmake --build . 
--target test_install + shell: cmd + + windows_cygwin: + strategy: + fail-fast: false + matrix: + config: + - {shared: "ON", build_type: "Debug", name: "omp/debug/shared", cflags: "-O1"} + - {shared: "OFF", build_type: "Release", name: "omp/release/static", cflags: ""} + name: cygwin/${{ matrix.config.name }} + runs-on: [windows-latest] + steps: + - run: git config --global core.autocrlf input + - uses: actions/checkout@v2 + - name: setup + run: | + choco install cygwin -y + choco install cyg-get -y + cyg-get cmake make gcc-g++ git + - name: shared_static_env + run: | + echo "::set-env name=shared_ON_path::;$pwd\build\windows_shared_library" + echo "::set-env name=shared_OFF_path::" + - name: debug_env + if: matrix.config.build_type == 'Debug' + run: | + echo "::set-env name=CXXFLAGS::-Wa,-mbig-obj" + - name: configure + run: | + path C:\tools\cygwin\bin%shared_${{ matrix.config.shared }}_path% + mkdir build + cd build + bash -c "cmake -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_COMPILER_FLAGS=${{ matrix.config.cflags }} .." + bash -c "make -j4" + bash -c "ctest . 
--output-on-failure" + shell: cmd + - name: install_shared + if: matrix.config.shared == 'ON' + run: | + path C:\tools\cygwin\bin + cd build + bash -c "make install" + bash -c "export PATH=/usr/local/lib:$PATH && make test_install" + shell: cmd + - name: install_static + if: matrix.config.shared == 'OFF' + run: | + path C:\tools\cygwin\bin + cd build + bash -c "make install" + bash -c "make test_install" + shell: cmd diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index eaf638e04d3..008e88c45ed 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,5 +1,3 @@ -image: localhost:5000/gko-cuda100-gnu7-llvm60 - stages: - sync - build @@ -15,22 +13,24 @@ stages: # Templates with reasonable defaults for builds and tests .variables_template: &default_variables BENCHMARK_SERVER: "FINECI" - C_COMPILER: gcc - CXX_COMPILER: g++ - BUILD_TYPE: Debug + C_COMPILER: "gcc" + CXX_COMPILER: "g++" + CUDA_COMPILER: "nvcc" + BUILD_TYPE: "Debug" BUILD_SHARED_LIBS: "ON" BUILD_REFERENCE: "ON" BUILD_OMP: "OFF" BUILD_CUDA: "OFF" + BUILD_HIP: "OFF" CXX_FLAGS: "" EXTRA_CMAKE_FLAGS: "" .before_script_template: &default_before_script - - export OMP_NUM_THREADS=4 + - export NUM_CORES=${CI_PARALLELISM} + - export OMP_NUM_THREADS=${NUM_CORES} - export CUDA_VISIBLE_DEVICES=0 .before_script_git_template: &git_before_script - # set up identities - eval $(ssh-agent -s) - echo "${BOT_KEY}" | tr -d '\r' | ssh-add - >/dev/null - mkdir -p ~/.ssh @@ -45,25 +45,57 @@ stages: before_script: *default_before_script script: - mkdir -p ${CI_JOB_NAME} && cd ${CI_JOB_NAME} + - if [ -n "${CUDA_ARCH}" ]; then + CUDA_ARCH_STR=-DGINKGO_CUDA_ARCHITECTURES=${CUDA_ARCH}; + CUDA_HOST_STR=-DCMAKE_CUDA_HOST_COMPILER=$(which ${CXX_COMPILER}); + fi - cmake ${CI_PROJECT_DIR} + -GNinja -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${CXX_COMPILER} - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" - -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${EXTRA_CMAKE_FLAGS} + -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} 
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} + -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} + ${EXTRA_CMAKE_FLAGS} ${CUDA_ARCH_STR} ${CUDA_HOST_STR} -DGINKGO_DEVEL_TOOLS=OFF -DGINKGO_BUILD_REFERENCE=${BUILD_REFERENCE} -DGINKGO_BUILD_OMP=${BUILD_OMP} -DGINKGO_BUILD_CUDA=${BUILD_CUDA} + -DGINKGO_BUILD_HIP=${BUILD_HIP} -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON - - make -j$(grep "core id" /proc/cpuinfo | sort -u | wc -l) + - ninja -j${NUM_CORES} -l${CI_LOAD_LIMIT} + dependencies: [] + except: + - schedules + +.build_template: &default_build_with_test + stage: build + variables: *default_variables + before_script: *default_before_script + script: + - mkdir -p ${CI_JOB_NAME} && cd ${CI_JOB_NAME} + - if [ -n "${CUDA_ARCH}" ]; then + CUDA_ARCH_STR=-DGINKGO_CUDA_ARCHITECTURES=${CUDA_ARCH}; + CUDA_HOST_STR=-DCMAKE_CUDA_HOST_COMPILER=$(which ${CXX_COMPILER}); + fi + - cmake ${CI_PROJECT_DIR} + -GNinja + -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${CXX_COMPILER} + -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} + -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} + ${EXTRA_CMAKE_FLAGS} ${CUDA_ARCH_STR} ${CUDA_HOST_STR} + -DGINKGO_DEVEL_TOOLS=OFF -DGINKGO_BUILD_REFERENCE=${BUILD_REFERENCE} + -DGINKGO_BUILD_OMP=${BUILD_OMP} -DGINKGO_BUILD_CUDA=${BUILD_CUDA} + -DGINKGO_BUILD_HIP=${BUILD_HIP} + -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON + - ninja -j${NUM_CORES} -l${CI_LOAD_LIMIT} install - | (( $(ctest -N | tail -1 | sed 's/Total Tests: //') != 0 )) || exit 1 - ctest -V - - make install - - make test_install + - ninja test_install dependencies: [] except: - schedules sync: stage: sync + image: localhost:5000/gko-nocuda-gnu9-llvm8 variables: GIT_STRATEGY: none PRIVATE_REPO: git@gitlab.com:ginkgo-project/ginkgo.git @@ -80,287 +112,444 @@ sync: - develop except: - schedules + tags: + - private_ci + - cpu # Build jobs -build/cuda90/gcc/cuda/debug/shared: - <<: *default_build 
+build/cuda90/gcc/all/debug/shared: + <<: *default_build_with_test image: localhost:5000/gko-cuda90-gnu5-llvm39 variables: <<: *default_variables BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_TYPE: Debug - EXTRA_CMAKE_FLAGS: &cuda_flags - "-DGINKGO_CUDA_ARCHITECTURES=35 -DCMAKE_CUDA_HOST_COMPILER=${CXX_COMPILER}" + BUILD_HIP: "ON" + BUILD_TYPE: "Debug" + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG tags: + - private_ci - cuda - gpu -build/cuda90/clang/cuda/release/static: - <<: *default_build +build/cuda90/clang/all/release/static: + <<: *default_build_with_test image: localhost:5000/gko-cuda90-gnu5-llvm39 variables: <<: *default_variables - C_COMPILER: clang - CXX_COMPILER: clang++ + C_COMPILER: "clang" + CXX_COMPILER: "clang++" BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_TYPE: Release + BUILD_HIP: "ON" + BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "OFF" - EXTRA_CMAKE_FLAGS: *cuda_flags + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG tags: + - private_ci - cuda - gpu # cuda 9.1 and friends -build/cuda91/gcc/cuda/debug/static: - <<: *default_build +build/cuda91/gcc/all/debug/static: + <<: *default_build_with_test image: localhost:5000/gko-cuda91-gnu6-llvm40 variables: <<: *default_variables + BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_TYPE: Debug + BUILD_HIP: "ON" + BUILD_TYPE: "Debug" BUILD_SHARED_LIBS: "OFF" - EXTRA_CMAKE_FLAGS: *cuda_flags + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG tags: + - private_ci - cuda - gpu -build/cuda91/clang/cuda/release/shared: - <<: *default_build +build/cuda91/clang/all/release/shared: + <<: *default_build_with_test image: localhost:5000/gko-cuda91-gnu6-llvm40 variables: <<: *default_variables - C_COMPILER: clang - CXX_COMPILER: clang++ + C_COMPILER: "clang" + CXX_COMPILER: "clang++" BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_TYPE: Release - EXTRA_CMAKE_FLAGS: *cuda_flags + BUILD_HIP: "ON" + BUILD_TYPE: "Release" + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG tags: + - private_ci - cuda - gpu 
-build/cuda91/intel/cuda/debug/shared: - <<: *default_build - image: localhost:5000/gko-cuda91-gnu6-llvm40 - variables: - <<: *default_variables - C_COMPILER: icc - CXX_COMPILER: icpc - BUILD_OMP: "ON" - BUILD_CUDA: "ON" - BUILD_TYPE: Debug - EXTRA_CMAKE_FLAGS: *cuda_flags - tags: - - cuda - - gpu # cuda 9.2 and friends -build/cuda92/gcc/cuda/release/shared: - <<: *default_build - image: localhost:5000/gko-cuda92-gnu7-llvm50 +build/cuda92/gcc/all/release/shared: + <<: *default_build_with_test + image: localhost:5000/gko-cuda92-gnu7-llvm50-intel2017 variables: <<: *default_variables BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_TYPE: Release - EXTRA_CMAKE_FLAGS: *cuda_flags + BUILD_HIP: "ON" + BUILD_TYPE: "Release" + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG tags: + - private_ci - cuda - gpu -build/cuda92/clang/cuda/debug/static: - <<: *default_build - image: localhost:5000/gko-cuda92-gnu7-llvm50 +build/cuda92/clang/all/debug/static: + <<: *default_build_with_test + image: localhost:5000/gko-cuda92-gnu7-llvm50-intel2017 variables: <<: *default_variables - C_COMPILER: clang - CXX_COMPILER: clang++ + C_COMPILER: "clang" + CXX_COMPILER: "clang++" + BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_TYPE: Debug + BUILD_HIP: "ON" + BUILD_TYPE: "Debug" BUILD_SHARED_LIBS: "OFF" - EXTRA_CMAKE_FLAGS: *cuda_flags + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG tags: + - private_ci - cuda - gpu build/cuda92/intel/cuda/release/static: - <<: *default_build - image: localhost:5000/gko-cuda92-gnu7-llvm50 + <<: *default_build_with_test + image: localhost:5000/gko-cuda92-gnu7-llvm50-intel2017 variables: <<: *default_variables - C_COMPILER: icc - CXX_COMPILER: icpc + C_COMPILER: "icc" + CXX_COMPILER: "icpc" BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_TYPE: Release + BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "OFF" - EXTRA_CMAKE_FLAGS: *cuda_flags + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG tags: + - private_ci - cuda - gpu # cuda 10.0 and friends 
-build/cuda100/gcc/cuda/debug/shared: - <<: *default_build - image: localhost:5000/gko-cuda100-gnu7-llvm60 +build/cuda100/gcc/all/debug/shared: + <<: *default_build_with_test + image: localhost:5000/gko-cuda100-gnu7-llvm60-intel2018 variables: <<: *default_variables BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_TYPE: Debug - EXTRA_CMAKE_FLAGS: *cuda_flags + BUILD_HIP: "ON" + BUILD_TYPE: "Debug" + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG tags: + - private_ci - cuda - gpu -build/cuda100/clang/cuda/release/static: - <<: *default_build - image: localhost:5000/gko-cuda100-gnu7-llvm60 +build/cuda100/clang/all/release/static: + <<: *default_build_with_test + image: localhost:5000/gko-cuda100-gnu7-llvm60-intel2018 variables: <<: *default_variables - C_COMPILER: clang - CXX_COMPILER: clang++ + C_COMPILER: "clang" + CXX_COMPILER: "clang++" BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_TYPE: Release + BUILD_HIP: "ON" + BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "OFF" - EXTRA_CMAKE_FLAGS: *cuda_flags + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG tags: + - private_ci - cuda - gpu build/cuda100/intel/cuda/release/shared: - <<: *default_build - image: localhost:5000/gko-cuda100-gnu7-llvm60 + <<: *default_build_with_test + image: localhost:5000/gko-cuda100-gnu7-llvm60-intel2018 variables: <<: *default_variables - C_COMPILER: icc - CXX_COMPILER: icpc + C_COMPILER: "icc" + CXX_COMPILER: "icpc" BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_TYPE: Release - EXTRA_CMAKE_FLAGS: *cuda_flags + BUILD_TYPE: "Release" + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG tags: + - private_ci - cuda - gpu # cuda 10.1 and friends -build/cuda101/gcc/cuda/debug/shared: - <<: *default_build - image: localhost:5000/gko-cuda101-gnu8-llvm70 +build/cuda101/gcc/all/debug/shared: + <<: *default_build_with_test + image: localhost:5000/gko-cuda101-gnu8-llvm7-intel2019 variables: <<: *default_variables BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_TYPE: Debug - EXTRA_CMAKE_FLAGS: *cuda_flags + BUILD_HIP: "ON" + 
BUILD_TYPE: "Debug" + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG tags: + - private_ci - cuda - gpu -build/cuda101/clang/cuda/release/static: - <<: *default_build - image: localhost:5000/gko-cuda101-gnu8-llvm70 +build/cuda101/clang/all/release/static: + <<: *default_build_with_test + image: localhost:5000/gko-cuda101-gnu8-llvm7-intel2019 variables: <<: *default_variables - C_COMPILER: clang - CXX_COMPILER: clang++ + C_COMPILER: "clang" + CXX_COMPILER: "clang++" BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_TYPE: Debug - EXTRA_CMAKE_FLAGS: *cuda_flags + BUILD_HIP: "ON" + BUILD_TYPE: "Release" + BUILD_SHARED_LIBS: "OFF" + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG tags: + - private_ci - cuda - gpu build/cuda101/intel/cuda/debug/static: - <<: *default_build - image: localhost:5000/gko-cuda101-gnu8-llvm70 + <<: *default_build_with_test + image: localhost:5000/gko-cuda101-gnu8-llvm7-intel2019 variables: <<: *default_variables - C_COMPILER: icc - CXX_COMPILER: icpc + C_COMPILER: "icc" + CXX_COMPILER: "icpc" BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_TYPE: Debug - EXTRA_CMAKE_FLAGS: *cuda_flags + BUILD_TYPE: "Debug" + BUILD_SHARED_LIBS: "OFF" + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG tags: + - private_ci - cuda - gpu +# clang-cuda with cuda 10.1 and friends +build/clang-cuda101/gcc/all/release/shared: + <<: *default_build_with_test + image: localhost:5000/gko-cuda101-gnu8-llvm10-intel2019 + variables: + <<: *default_variables + CUDA_COMPILER: "clang++" + BUILD_OMP: "ON" + BUILD_CUDA: "ON" + BUILD_HIP: "ON" + BUILD_TYPE: "Release" + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG + tags: + - private_ci + - cuda + - gpu + +build/clang-cuda101/clang/cuda/debug/static: + <<: *default_build_with_test + image: localhost:5000/gko-cuda101-gnu8-llvm10-intel2019 + variables: + <<: *default_variables + C_COMPILER: "clang" + CXX_COMPILER: "clang++" + CUDA_COMPILER: "clang++" + BUILD_OMP: "ON" + BUILD_CUDA: "ON" + BUILD_TYPE: "Debug" + BUILD_SHARED_LIBS: "OFF" + 
CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG + tags: + - private_ci + - cuda + - gpu + +# HIP AMD +build/amd/gcc/hip/debug/shared: + <<: *default_build_with_test + image: localhost:5000/gko-amd-gnu8-llvm7 + variables: + <<: *default_variables + BUILD_OMP: "ON" + BUILD_HIP: "ON" + BUILD_TYPE: "Debug" + only: + variables: + - $RUN_CI_TAG + tags: + - private_ci + - amd + - gpu + +build/amd/clang/hip/release/static: + <<: *default_build_with_test + image: localhost:5000/gko-amd-gnu8-llvm7 + variables: + <<: *default_variables + C_COMPILER: "clang" + CXX_COMPILER: "clang++" + BUILD_OMP: "ON" + BUILD_HIP: "ON" + BUILD_TYPE: "Release" + BUILD_SHARED_LIBS: "OFF" + only: + variables: + - $RUN_CI_TAG + tags: + - private_ci + - amd + - gpu + # no cuda but latest gcc and clang build/nocuda/gcc/core/debug/static: - <<: *default_build + <<: *default_build_with_test image: localhost:5000/gko-nocuda-gnu9-llvm8 variables: <<: *default_variables BUILD_REFERENCE: "OFF" - BUILD_TYPE: Debug + BUILD_TYPE: "Debug" BUILD_SHARED_LIBS: "OFF" + only: + variables: + - $RUN_CI_TAG tags: + - private_ci - cpu build/nocuda/clang/core/release/shared: - <<: *default_build + <<: *default_build_with_test image: localhost:5000/gko-nocuda-gnu9-llvm8 variables: <<: *default_variables - C_COMPILER: clang - CXX_COMPILER: clang++ + C_COMPILER: "clang" + CXX_COMPILER: "clang++" BUILD_REFERENCE: "OFF" - BUILD_TYPE: Release + BUILD_TYPE: "Release" + only: + variables: + - $RUN_CI_TAG tags: + - private_ci - cpu build/nocuda/intel/core/debug/shared: - <<: *default_build - image: localhost:5000/gko-nocuda-gnu8-llvm70 + <<: *default_build_with_test + image: localhost:5000/gko-nocuda-gnu9-llvm8 variables: <<: *default_variables - C_COMPILER: icc - CXX_COMPILER: icpc + C_COMPILER: "icc" + CXX_COMPILER: "icpc" BUILD_REFERENCE: "OFF" - BUILD_TYPE: Debug + BUILD_TYPE: "Debug" + only: + variables: + - $RUN_CI_TAG tags: + - private_ci + - cuda - cpu build/nocuda/gcc/omp/release/shared: - <<: *default_build + <<: 
*default_build_with_test image: localhost:5000/gko-nocuda-gnu9-llvm8 variables: <<: *default_variables BUILD_OMP: "ON" - BUILD_TYPE: Release + BUILD_TYPE: "Release" + only: + variables: + - $RUN_CI_TAG tags: + - private_ci - cpu build/nocuda/clang/omp/debug/static: - <<: *default_build + <<: *default_build_with_test image: localhost:5000/gko-nocuda-gnu9-llvm8 variables: <<: *default_variables - C_COMPILER: clang - CXX_COMPILER: clang++ + C_COMPILER: "clang" + CXX_COMPILER: "clang++" BUILD_OMP: "ON" - BUILD_TYPE: Debug + BUILD_TYPE: "Debug" BUILD_SHARED_LIBS: "OFF" + only: + variables: + - $RUN_CI_TAG tags: + - private_ci - cpu build/nocuda/intel/omp/release/static: - <<: *default_build - image: localhost:5000/gko-nocuda-gnu8-llvm70 + <<: *default_build_with_test + image: localhost:5000/gko-nocuda-gnu9-llvm8 variables: <<: *default_variables - C_COMPILER: icc - CXX_COMPILER: icpc + C_COMPILER: "icc" + CXX_COMPILER: "icpc" BUILD_OMP: "ON" - BUILD_TYPE: Release + BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "OFF" + only: + variables: + - $RUN_CI_TAG tags: + - private_ci + - cuda - cpu @@ -368,15 +557,20 @@ build/nocuda/intel/omp/release/static: warnings: <<: *default_build stage: code_quality - image: localhost:5000/gko-cuda101-gnu8-llvm70 + image: localhost:5000/gko-cuda101-gnu8-llvm7-intel2019 variables: <<: *default_variables BUILD_OMP: "ON" BUILD_CUDA: "ON" + BUILD_HIP: "ON" CXX_FLAGS: "-Werror=pedantic -pedantic-errors" + only: + variables: + - $RUN_CI_TAG dependencies: [] allow_failure: yes tags: + - private_ci - cuda - gpu @@ -384,15 +578,20 @@ warnings: no-circular-deps: <<: *default_build stage: code_quality - image: localhost:5000/gko-cuda101-gnu8-llvm70 + image: localhost:5000/gko-cuda101-gnu8-llvm7-intel2019 variables: <<: *default_variables BUILD_OMP: "ON" BUILD_CUDA: "ON" + BUILD_HIP: "ON" EXTRA_CMAKE_FLAGS: '-DGINKGO_CHECK_CIRCULAR_DEPS=on' + only: + variables: + - $RUN_CI_TAG dependencies: [] allow_failure: no tags: + - private_ci - cuda - gpu @@ -400,30 
+599,40 @@ no-circular-deps: clang-tidy: <<: *default_build stage: code_quality - image: localhost:5000/gko-cuda101-gnu8-llvm70 + image: localhost:5000/gko-cuda101-gnu8-llvm7-intel2019 variables: <<: *default_variables BUILD_OMP: "ON" BUILD_CUDA: "ON" + BUILD_HIP: "ON" EXTRA_CMAKE_FLAGS: '-DGINKGO_WITH_CLANG_TIDY=ON' + only: + variables: + - $RUN_CI_TAG dependencies: [] allow_failure: yes tags: + - private_ci - cuda - gpu iwyu: <<: *default_build stage: code_quality - image: localhost:5000/gko-cuda101-gnu8-llvm70 + image: localhost:5000/gko-cuda101-gnu8-llvm7-intel2019 variables: <<: *default_variables BUILD_OMP: "ON" BUILD_CUDA: "ON" + BUILD_HIP: "ON" EXTRA_CMAKE_FLAGS: '-DGINKGO_WITH_IWYU=ON' + only: + variables: + - $RUN_CI_TAG dependencies: [] allow_failure: yes tags: + - private_ci - cuda - gpu @@ -431,7 +640,7 @@ iwyu: # For short living branches or PRs, try to detect an open PR sonarqube_cov_: stage: code_quality - image: localhost:5000/gko-cuda101-gnu8-llvm70 + image: localhost:5000/gko-cuda101-gnu8-llvm7-intel2019 before_script: *default_before_script script: - PR_ID=$(curl "https://api.github.com/search/issues?q=sha:${CI_COMMIT_SHA}" @@ -453,7 +662,7 @@ sonarqube_cov_: -Dsonar.cfamily.build-wrapper-output=build/bw-output -Dsonar.cfamily.gcov.reportsPath=build/Testing/CoverageInfo ${sonar_branching} -# - bash <(curl -s https://codecov.io/bash) -X gcov -X xcode -f "!*examples*" -f "!*third_party*" -f "!*c\\+\\+*" -f "!*benchmark*" + - bash <(curl -s https://codecov.io/bash) -f "\!*examples*" -f "\!*third_party*" -f "\!*c\\+\\+*" -f "\!*benchmark*" dependencies: [] except: refs: @@ -462,8 +671,9 @@ sonarqube_cov_: - tags only: variables: - - $PUBLIC_CI_TAG + - $RUN_CI_TAG tags: + - private_ci - cuda - gpu @@ -471,7 +681,7 @@ sonarqube_cov_: # (the one that was merged). 
sonarqube_cov: stage: code_quality - image: localhost:5000/gko-cuda101-gnu8-llvm70 + image: localhost:5000/gko-cuda101-gnu8-llvm7-intel2019 before_script: *default_before_script script: - ctest -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=COVERAGE @@ -480,7 +690,7 @@ sonarqube_cov: -Dsonar.cfamily.build-wrapper-output=build/bw-output -Dsonar.cfamily.gcov.reportsPath=build/Testing/CoverageInfo -Dsonar.branch.name=${CI_COMMIT_REF_NAME} -# - bash <(curl -s https://codecov.io/bash) -X gcov -X xcode -f "!*test*" -f "!*examples*" -f "!*third_party*" -f "!*c\\+\\+*" -f "!*benchmark*" + - bash <(curl -s https://codecov.io/bash) -f "\!*examples*" -f "\!*third_party*" -f "\!*c\\+\\+*" -f "\!*benchmark*" dependencies: [] only: refs: @@ -488,8 +698,9 @@ sonarqube_cov: - master - tags variables: - - $PUBLIC_CI_TAG + - $RUN_CI_TAG tags: + - private_ci - cuda - gpu @@ -507,10 +718,10 @@ gh-pages: - mkdir -p ${CI_JOB_NAME} && pushd ${CI_JOB_NAME} - cmake ${CI_PROJECT_DIR} -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${CXX_COMPILER} - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DBUILD_SHARED_LIBS=ON - ${EXTRA_CMAKE_FLAGS} -DGINKGO_DEVEL_TOOLS=OFF -DGINKGO_BUILD_REFERENCE=OFF - -DGINKGO_BUILD_OMP=OFF -DGINKGO_BUILD_CUDA=OFF - -DGINKGO_BUILD_TESTS=OFF -DGINKGO_BUILD_EXAMPLES=OFF + -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} + -DBUILD_SHARED_LIBS=ON ${EXTRA_CMAKE_FLAGS} -DGINKGO_DEVEL_TOOLS=OFF + -DGINKGO_BUILD_REFERENCE=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_BUILD_CUDA=OFF + -DGINKGO_BUILD_HIP=OFF -DGINKGO_BUILD_TESTS=OFF -DGINKGO_BUILD_EXAMPLES=OFF -DGINKGO_BUILD_DOC=ON -DGINKGO_DOC_GENERATE_PDF=ON - make usr - make pdf @@ -534,18 +745,25 @@ gh-pages: - master - tags variables: - - $PUBLIC_CI_TAG + - $RUN_CI_TAG except: - schedules + tags: + - private_ci + - cpu threadsanitizer: stage: QoS_tools - image: localhost:5000/gko-cuda101-gnu8-llvm70 + image: localhost:5000/gko-cuda101-gnu8-llvm10-intel2019 before_script: *default_before_script script: - - 
ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=TSAN - -DCTEST_MEMORYCHECK_TYPE=ThreadSanitizer + - LD_PRELOAD=/usr/local/lib/libomp.so + CC=clang CXX=clang++ + ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=TSAN + -DCTEST_MEMORYCHECK_TYPE=ThreadSanitizer + -DCTEST_MEMORYCHECK_SANITIZER_OPTIONS=ignore_noninstrumented_modules=1 + --timeout 6000 dependencies: [] only: refs: @@ -553,14 +771,35 @@ threadsanitizer: - develop - tags variables: - - $PUBLIC_CI_TAG + - $RUN_CI_TAG tags: + - private_ci + - cuda + - gpu + +leaksanitizer: + stage: QoS_tools + image: localhost:5000/gko-cuda101-gnu8-llvm10-intel2019 + before_script: *default_before_script + script: + - ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=LSAN + -DCTEST_MEMORYCHECK_TYPE=LeakSanitizer + dependencies: [] + only: + refs: + - master + - develop + - tags + variables: + - $RUN_CI_TAG + tags: + - private_ci - cuda - gpu addresssanitizer: stage: QoS_tools - image: localhost:5000/gko-cuda101-gnu8-llvm70 + image: localhost:5000/gko-cuda101-gnu8-llvm10-intel2019 before_script: *default_before_script script: - ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=ASAN @@ -572,17 +811,21 @@ addresssanitizer: - develop - tags variables: - - $PUBLIC_CI_TAG + - $RUN_CI_TAG tags: + - private_ci - cuda - gpu -valgrind: +undefinedsanitizer: stage: QoS_tools - image: localhost:5000/gko-cuda101-gnu8-llvm70 + image: localhost:5000/gko-cuda101-gnu8-llvm10-intel2019 before_script: *default_before_script script: - - ctest -V -S cmake/CTestScript.cmake -DCTEST_MEMORYCHECK_TYPE=Valgrind + # the Gold linker is required because of a linker flag issue caused by UBsan + # in the Ubuntu setup we are using. 
+ - ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=UBSAN + -DCTEST_MEMORYCHECK_TYPE=UndefinedBehaviorSanitizer dependencies: [] only: refs: @@ -590,8 +833,9 @@ valgrind: - develop - tags variables: - - $PUBLIC_CI_TAG + - $RUN_CI_TAG tags: + - private_ci - cuda - gpu @@ -617,10 +861,11 @@ valgrind: fineci-benchmark-build: stage: benchmark-build + image: localhost:5000/gko-nocuda-gnu9-llvm8 variables: <<: *default_variables BENCHMARK_SERVER: FINECI - BUILD_TYPE: Release + BUILD_TYPE: "Release" BUILD_OMP: "ON" BUILD_CUDA: "ON" PUBLIC_REPO: https://github.com/ginkgo-project/ginkgo.git @@ -645,15 +890,20 @@ fineci-benchmark-build: -DGINKGO_BUILD_REFERENCE=${BUILD_REFERENCE} \\ -DGINKGO_BUILD_OMP=${BUILD_OMP} \\ -DGINKGO_BUILD_CUDA=${BUILD_CUDA} \\ + -DGINKGO_BUILD_HIP=${BUILD_HIP} \\ -DGINKGO_BUILD_TESTS=OFF -DGINKGO_BUILD_EXAMPLES=OFF \\ -DGINKGO_BUILD_BENCHMARKS=ON - make -j$(grep 'core id' /proc/cpuinfo | sort -u | wc -l) + make -j${CI_PARALLELISM} EOT dependencies: [] only: - schedules # - develop # - master + tags: + - private_ci + - cpu + - cuda # Benchmark runs @@ -692,6 +942,7 @@ fineci-benchmark-build: fineci-benchmark-cuda: stage: benchmark-cuda + image: localhost:5000/gko-nocuda-gnu9-llvm8 variables: <<: *default_variables BENCHMARK_SERVER: FINECI @@ -700,6 +951,10 @@ fineci-benchmark-cuda: BENCHMARK_REPO: git@github.com:ginkgo-project/ginkgo-data.git SYSTEM_NAME: K20Xm <<: *default_benchmark + tags: + - private_ci + - cpu + - cuda # fineci-benchmark-omp: # stage: benchmark-omp @@ -725,6 +980,7 @@ fineci-benchmark-cuda: new-issue-on-failure: stage: on-failure + image: localhost:5000/gko-nocuda-gnu9-llvm8 script: curl --request POST "https://gitlab.com/api/v4/projects/${PROJECT_ID}/issues?private_token=${BOT_ACCESS_TOKEN}&title=Error%20in%20${CI_PROJECT_NAME}%20with%20pipeline%20${CI_PIPELINE_ID}%20for%20commit%20${CI_COMMIT_SHA}&labels&description=${CI_PIPELINE_URL}" when: on_failure only: @@ -732,3 +988,6 @@ new-issue-on-failure: - develop - 
master dependencies: [] + tags: + - private_ci + - cpu diff --git a/CHANGELOG.md b/CHANGELOG.md index 32a05209929..bba805119bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,150 @@ commits. For a comprehensive list, use the following command: git log --first-parent ``` +## Version 1.2.0 + +The Ginkgo team is proud to announce the new minor release of Ginkgo version +1.2.0. This release brings full HIP support to Ginkgo, new preconditioners +(ParILUT, ISAI), conversion between double and float for all LinOps, and many +more features and fixes. + +Supported systems and requirements: ++ For all platforms, cmake 3.9+ ++ Linux and MacOS + + gcc: 5.3+, 6.3+, 7.3+, all versions after 8.1+ + + clang: 3.9+ + + Intel compiler: 2017+ + + Apple LLVM: 8.0+ + + CUDA module: CUDA 9.0+ + + HIP module: ROCm 2.8+ ++ Windows + + MinGW and CygWin: gcc 5.3+, 6.3+, 7.3+, all versions after 8.1+ + + Microsoft Visual Studio: VS 2017 15.7+ + + CUDA module: CUDA 9.0+, Microsoft Visual Studio + + OpenMP module: MinGW or CygWin. + + +The current known issues can be found in the [known issues page](https://github.com/ginkgo-project/ginkgo/wiki/Known-Issues). + + +### Additions +Here are the main additions to the Ginkgo library. Other thematic additions are listed below. 
++ Add full HIP support to Ginkgo [#344](https://github.com/ginkgo-project/ginkgo/pull/344), [#357](https://github.com/ginkgo-project/ginkgo/pull/357), [#384](https://github.com/ginkgo-project/ginkgo/pull/384), [#373](https://github.com/ginkgo-project/ginkgo/pull/373), [#391](https://github.com/ginkgo-project/ginkgo/pull/391), [#396](https://github.com/ginkgo-project/ginkgo/pull/396), [#395](https://github.com/ginkgo-project/ginkgo/pull/395), [#393](https://github.com/ginkgo-project/ginkgo/pull/393), [#404](https://github.com/ginkgo-project/ginkgo/pull/404), [#439](https://github.com/ginkgo-project/ginkgo/pull/439), [#443](https://github.com/ginkgo-project/ginkgo/pull/443), [#567](https://github.com/ginkgo-project/ginkgo/pull/567) ++ Add a new ISAI preconditioner [#489](https://github.com/ginkgo-project/ginkgo/pull/489), [#502](https://github.com/ginkgo-project/ginkgo/pull/502), [#512](https://github.com/ginkgo-project/ginkgo/pull/512), [#508](https://github.com/ginkgo-project/ginkgo/pull/508), [#520](https://github.com/ginkgo-project/ginkgo/pull/520) ++ Add support for ParILUT and ParICT factorization with ILU preconditioners [#400](https://github.com/ginkgo-project/ginkgo/pull/400) ++ Add a new BiCG solver [#438](https://github.com/ginkgo-project/ginkgo/pull/438) ++ Add a new permutation matrix format [#352](https://github.com/ginkgo-project/ginkgo/pull/352), [#469](https://github.com/ginkgo-project/ginkgo/pull/469) ++ Add CSR SpGEMM support [#386](https://github.com/ginkgo-project/ginkgo/pull/386), [#398](https://github.com/ginkgo-project/ginkgo/pull/398), [#418](https://github.com/ginkgo-project/ginkgo/pull/418), [#457](https://github.com/ginkgo-project/ginkgo/pull/457) ++ Add CSR SpGEAM support [#556](https://github.com/ginkgo-project/ginkgo/pull/556) ++ Make all solvers and preconditioners transposable [#535](https://github.com/ginkgo-project/ginkgo/pull/535) ++ Add CsrBuilder and CooBuilder for intrusive access to matrix arrays 
[#437](https://github.com/ginkgo-project/ginkgo/pull/437) ++ Add a standard-compliant allocator based on the Executors [#504](https://github.com/ginkgo-project/ginkgo/pull/504) ++ Support conversions for all LinOp between double and float [#521](https://github.com/ginkgo-project/ginkgo/pull/521) ++ Add a new boolean to the CUDA and HIP executors to control DeviceReset (default off) [#557](https://github.com/ginkgo-project/ginkgo/pull/557) ++ Add a relaxation factor to IR to represent Richardson Relaxation [#574](https://github.com/ginkgo-project/ginkgo/pull/574) ++ Add two new stopping criteria, for relative (to `norm(b)`) and absolute residual norm [#577](https://github.com/ginkgo-project/ginkgo/pull/577) + +#### Example additions ++ Templatize all examples to simplify changing the precision [#513](https://github.com/ginkgo-project/ginkgo/pull/513) ++ Add a new adaptive precision block-Jacobi example [#507](https://github.com/ginkgo-project/ginkgo/pull/507) ++ Add a new IR example [#522](https://github.com/ginkgo-project/ginkgo/pull/522) ++ Add a new Mixed Precision Iterative Refinement example [#525](https://github.com/ginkgo-project/ginkgo/pull/525) ++ Add a new example on iterative trisolves in ILU preconditioning [#526](https://github.com/ginkgo-project/ginkgo/pull/526), [#536](https://github.com/ginkgo-project/ginkgo/pull/536), [#550](https://github.com/ginkgo-project/ginkgo/pull/550) + +#### Compilation and library changes ++ Auto-detect compilation settings based on environment [#435](https://github.com/ginkgo-project/ginkgo/pull/435), [#537](https://github.com/ginkgo-project/ginkgo/pull/537) ++ Add SONAME to shared libraries [#524](https://github.com/ginkgo-project/ginkgo/pull/524) ++ Add clang-cuda support [#543](https://github.com/ginkgo-project/ginkgo/pull/543) + +#### Other additions ++ Add sorting, searching and merging kernels for GPUs [#403](https://github.com/ginkgo-project/ginkgo/pull/403), 
[#428](https://github.com/ginkgo-project/ginkgo/pull/428), [#417](https://github.com/ginkgo-project/ginkgo/pull/417), [#455](https://github.com/ginkgo-project/ginkgo/pull/455) ++ Add `gko::as` support for smart pointers [#493](https://github.com/ginkgo-project/ginkgo/pull/493) ++ Add setters and getters for criterion factories [#527](https://github.com/ginkgo-project/ginkgo/pull/527) ++ Add a new method to check whether a solver uses `x` as an initial guess [#531](https://github.com/ginkgo-project/ginkgo/pull/531) ++ Add contribution guidelines [#549](https://github.com/ginkgo-project/ginkgo/pull/549) + +### Fixes +#### Algorithms ++ Improve the classical CSR strategy's performance [#401](https://github.com/ginkgo-project/ginkgo/pull/401) ++ Improve the CSR automatical strategy [#407](https://github.com/ginkgo-project/ginkgo/pull/407), [#559](https://github.com/ginkgo-project/ginkgo/pull/559) ++ Memory, speed improvements to the ELL kernel [#411](https://github.com/ginkgo-project/ginkgo/pull/411) ++ Multiple improvements and fixes to ParILU [#419](https://github.com/ginkgo-project/ginkgo/pull/419), [#427](https://github.com/ginkgo-project/ginkgo/pull/427), [#429](https://github.com/ginkgo-project/ginkgo/pull/429), [#456](https://github.com/ginkgo-project/ginkgo/pull/456), [#544](https://github.com/ginkgo-project/ginkgo/pull/544) ++ Fix multiple issues with GMRES [#481](https://github.com/ginkgo-project/ginkgo/pull/481), [#523](https://github.com/ginkgo-project/ginkgo/pull/523), [#575](https://github.com/ginkgo-project/ginkgo/pull/575) ++ Optimize OpenMP matrix conversions [#505](https://github.com/ginkgo-project/ginkgo/pull/505) ++ Ensure the linearity of the ILU preconditioner [#506](https://github.com/ginkgo-project/ginkgo/pull/506) ++ Fix IR's use of the advanced apply [#522](https://github.com/ginkgo-project/ginkgo/pull/522) ++ Fix empty matrices conversions and add tests [#560](https://github.com/ginkgo-project/ginkgo/pull/560) + +#### Other core 
functionalities ++ Fix complex number support in our math header [#410](https://github.com/ginkgo-project/ginkgo/pull/410) ++ Fix CUDA compatibility of the main ginkgo header [#450](https://github.com/ginkgo-project/ginkgo/pull/450) ++ Fix isfinite issues [#465](https://github.com/ginkgo-project/ginkgo/pull/465) ++ Fix the Array::view memory leak and the array/view copy/move [#485](https://github.com/ginkgo-project/ginkgo/pull/485) ++ Fix typos preventing use of some interface functions [#496](https://github.com/ginkgo-project/ginkgo/pull/496) ++ Fix the `gko::dim` to abide by the C++ standard [#498](https://github.com/ginkgo-project/ginkgo/pull/498) ++ Simplify the executor copy interface [#516](https://github.com/ginkgo-project/ginkgo/pull/516) ++ Optimize intermediate storage for Composition [#540](https://github.com/ginkgo-project/ginkgo/pull/540) ++ Provide an initial guess for relevant Compositions [#561](https://github.com/ginkgo-project/ginkgo/pull/561) ++ Better management of nullptr as criterion [#562](https://github.com/ginkgo-project/ginkgo/pull/562) ++ Fix the norm calculations for complex support [#564](https://github.com/ginkgo-project/ginkgo/pull/564) + +#### CUDA and HIP specific ++ Use the return value of the atomic operations in our wrappers [#405](https://github.com/ginkgo-project/ginkgo/pull/405) ++ Improve the portability of warp lane masks [#422](https://github.com/ginkgo-project/ginkgo/pull/422) ++ Extract thread ID computation into a separate function [#464](https://github.com/ginkgo-project/ginkgo/pull/464) ++ Reorder kernel parameters for consistency [#474](https://github.com/ginkgo-project/ginkgo/pull/474) ++ Fix the use of `pragma unroll` in HIP [#492](https://github.com/ginkgo-project/ginkgo/pull/492) + +#### Other ++ Fix the Ginkgo CMake installation files [#414](https://github.com/ginkgo-project/ginkgo/pull/414), [#553](https://github.com/ginkgo-project/ginkgo/pull/553) ++ Fix the Windows compilation 
[#415](https://github.com/ginkgo-project/ginkgo/pull/415) ++ Always use demangled types in error messages [#434](https://github.com/ginkgo-project/ginkgo/pull/434), [#486](https://github.com/ginkgo-project/ginkgo/pull/486) ++ Add CUDA header dependency to appropriate tests [#452](https://github.com/ginkgo-project/ginkgo/pull/452) ++ Fix several sonarqube or compilation warnings [#453](https://github.com/ginkgo-project/ginkgo/pull/453), [#463](https://github.com/ginkgo-project/ginkgo/pull/463), [#532](https://github.com/ginkgo-project/ginkgo/pull/532), [#569](https://github.com/ginkgo-project/ginkgo/pull/569) ++ Add shuffle tests [#460](https://github.com/ginkgo-project/ginkgo/pull/460) ++ Fix MSVC C2398 error [#490](https://github.com/ginkgo-project/ginkgo/pull/490) ++ Fix missing interface tests in test install [#558](https://github.com/ginkgo-project/ginkgo/pull/558) + +### Tools and ecosystem +#### Benchmarks ++ Add better norm support in the benchmarks [#377](https://github.com/ginkgo-project/ginkgo/pull/377) ++ Add CUDA 10.1 generic SpMV support in benchmarks [#468](https://github.com/ginkgo-project/ginkgo/pull/468), [#473](https://github.com/ginkgo-project/ginkgo/pull/473) ++ Add sparse library ILU in benchmarks [#487](https://github.com/ginkgo-project/ginkgo/pull/487) ++ Add overhead benchmarking capacities [#501](https://github.com/ginkgo-project/ginkgo/pull/501) ++ Allow benchmarking from a matrix list file [#503](https://github.com/ginkgo-project/ginkgo/pull/503) ++ Fix benchmarking issue with JSON and non-finite numbers [#514](https://github.com/ginkgo-project/ginkgo/pull/514) ++ Fix benchmark logger crashers with OpenMP [#565](https://github.com/ginkgo-project/ginkgo/pull/565) + +#### CI related ++ Improvements to the CI setup with HIP compilation [#421](https://github.com/ginkgo-project/ginkgo/pull/421), [#466](https://github.com/ginkgo-project/ginkgo/pull/466) ++ Add MacOSX CI support [#470](https://github.com/ginkgo-project/ginkgo/pull/470), 
[#488](https://github.com/ginkgo-project/ginkgo/pull/488) ++ Add Windows CI support [#471](https://github.com/ginkgo-project/ginkgo/pull/471), [#488](https://github.com/ginkgo-project/ginkgo/pull/488), [#510](https://github.com/ginkgo-project/ginkgo/pull/510), [#566](https://github.com/ginkgo-project/ginkgo/pull/566) ++ Use sanitizers instead of valgrind [#476](https://github.com/ginkgo-project/ginkgo/pull/476) ++ Add automatic container generation and update facilities [#499](https://github.com/ginkgo-project/ginkgo/pull/499) ++ Fix the CI parallelism settings [#517](https://github.com/ginkgo-project/ginkgo/pull/517), [#538](https://github.com/ginkgo-project/ginkgo/pull/538), [#539](https://github.com/ginkgo-project/ginkgo/pull/539) ++ Make the codecov patch check informational [#519](https://github.com/ginkgo-project/ginkgo/pull/519) ++ Add support for LLVM sanitizers with improved thread sanitizer support [#578](https://github.com/ginkgo-project/ginkgo/pull/578) + +#### Test suite ++ Add an assertion for sparsity pattern equality [#416](https://github.com/ginkgo-project/ginkgo/pull/416) ++ Add core and reference multiprecision tests support [#448](https://github.com/ginkgo-project/ginkgo/pull/448) ++ Speed up GPU tests by avoiding device reset [#467](https://github.com/ginkgo-project/ginkgo/pull/467) ++ Change test matrix location string [#494](https://github.com/ginkgo-project/ginkgo/pull/494) + +#### Other ++ Add Ginkgo badges from our tools [#413](https://github.com/ginkgo-project/ginkgo/pull/413) ++ Update the `create_new_algorithm.sh` script [#420](https://github.com/ginkgo-project/ginkgo/pull/420) ++ Bump copyright and improve license management [#436](https://github.com/ginkgo-project/ginkgo/pull/436), [#433](https://github.com/ginkgo-project/ginkgo/pull/433) ++ Set clang-format minimum requirement [#441](https://github.com/ginkgo-project/ginkgo/pull/441), [#484](https://github.com/ginkgo-project/ginkgo/pull/484) ++ Update git-cmake-format 
[#446](https://github.com/ginkgo-project/ginkgo/pull/446), [#484](https://github.com/ginkgo-project/ginkgo/pull/484) ++ Disable the development tools by default [#442](https://github.com/ginkgo-project/ginkgo/pull/442) ++ Add a script for automatic header formatting [#447](https://github.com/ginkgo-project/ginkgo/pull/447) ++ Add GDB pretty printer for `gko::Array` [#509](https://github.com/ginkgo-project/ginkgo/pull/509) ++ Improve compilation speed [#533](https://github.com/ginkgo-project/ginkgo/pull/533) ++ Add editorconfig support [#546](https://github.com/ginkgo-project/ginkgo/pull/546) ++ Add a compile-time check for header self-sufficiency [#552](https://github.com/ginkgo-project/ginkgo/pull/552) + + ## Version 1.1.1 This version of Ginkgo provides a few fixes in Ginkgo's core routines. The supported systems and requirements are unchanged from version 1.1.0. diff --git a/CITING.md b/CITING.md new file mode 100644 index 00000000000..7f579d1a69f --- /dev/null +++ b/CITING.md @@ -0,0 +1,94 @@ +# Citing Ginkgo {#citing_ginkgo} + +The main Ginkgo paper describing Ginkgo's purpose, design and interface is +available through the following reference: + +``` bibtex +@misc{anzt2020ginkgo, + title={Ginkgo: A Modern Linear Operator Algebra Framework for High Performance Computing}, + author={Hartwig Anzt and Terry Cojean and Goran Flegar and Fritz Göbel and Thomas Grützmacher and Pratik Nayak and Tobias Ribizel and Yuhsiang Mike Tsai and Enrique S. Quintana-Ortí}, + year={2020}, + eprint={2006.16852}, + archivePrefix={arXiv}, + primaryClass={cs.MS} +} +``` + +Multiple topical papers exist on Ginkgo and its algorithms. The following papers +can be used to cite specific aspects of the Ginkgo project. + +### On Portability + +``` bibtex +@misc{tsai2020amdportability, + title={Preparing Ginkgo for AMD GPUs -- A Testimonial on Porting CUDA Code to HIP}, + author={Yuhsiang M. 
Tsai and Terry Cojean and Tobias Ribizel and Hartwig Anzt}, + year={2020}, + eprint={2006.14290}, + archivePrefix={arXiv}, + primaryClass={cs.MS} +} +``` + +### On Software Sustainability + +``` bibtex +@inproceedings{anzt2019pasccb, +author = {Anzt, Hartwig and Chen, Yen-Chen and Cojean, Terry and Dongarra, Jack and Flegar, Goran and Nayak, Pratik and Quintana-Ort\'{\i}, Enrique S. and Tsai, Yuhsiang M. and Wang, Weichung}, +title = {Towards Continuous Benchmarking: An Automated Performance Evaluation Framework for High Performance Software}, +year = {2019}, +isbn = {9781450367707}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/3324989.3325719}, +doi = {10.1145/3324989.3325719}, +booktitle = {Proceedings of the Platform for Advanced Scientific Computing Conference}, +articleno = {9}, +numpages = {11}, +keywords = {interactive performance visualization, healthy software lifecycle, continuous integration, automated performance benchmarking}, +location = {Zurich, Switzerland}, +series = {PASC ’19} +} +``` + +### On SpMV performance + +``` bibtex +@InProceedings{tsai2020amdspmv, +author="Tsai, Yuhsiang M. +and Cojean, Terry +and Anzt, Hartwig", +editor="Sadayappan, Ponnuswamy +and Chamberlain, Bradford L. +and Juckeland, Guido +and Ltaief, Hatem", +title="Sparse Linear Algebra on AMD and NVIDIA GPUs -- The Race Is On", +booktitle="High Performance Computing", +year="2020", +publisher="Springer International Publishing", +address="Cham", +pages="309--327", +abstract="Efficiently processing sparse matrices is a central and performance-critical part of many scientific simulation codes. Recognizing the adoption of manycore accelerators in HPC, we evaluate in this paper the performance of the currently best sparse matrix-vector product (SpMV) implementations on high-end GPUs from AMD and NVIDIA. 
Specifically, we optimize SpMV kernels for the CSR, COO, ELL, and HYB format taking the hardware characteristics of the latest GPU technologies into account. We compare for 2,800 test matrices the performance of our kernels against AMD's hipSPARSE library and NVIDIA's cuSPARSE library, and ultimately assess how the GPU technologies from AMD and NVIDIA compare in terms of SpMV performance.", +isbn="978-3-030-50743-5" +} + + +@article{anzt2020spmv, +author = {Anzt, Hartwig and Cojean, Terry and Yen-Chen, Chen and Dongarra, Jack and Flegar, Goran and Nayak, Pratik and Tomov, Stanimire and Tsai, Yuhsiang M. and Wang, Weichung}, +title = {Load-Balancing Sparse Matrix Vector Product Kernels on GPUs}, +year = {2020}, +issue_date = {March 2020}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +volume = {7}, +number = {1}, +issn = {2329-4949}, +url = {https://doi.org/10.1145/3380930}, +doi = {10.1145/3380930}, +journal = {ACM Trans. Parallel Comput.}, +month = mar, +articleno = {2}, +numpages = {26}, +keywords = {irregular matrices, GPUs, Sparse Matrix Vector Product (SpMV)} +} +``` diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e9af2bdd07..5835b7a27a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,17 +1,23 @@ cmake_minimum_required(VERSION 3.9) -project(Ginkgo LANGUAGES C CXX VERSION 1.1.1 DESCRIPTION "A numerical linear algebra library targeting many-core architectures") +project(Ginkgo LANGUAGES C CXX VERSION 1.2.0 DESCRIPTION "A numerical linear algebra library targeting many-core architectures") set(Ginkgo_VERSION_TAG "master") set(PROJECT_VERSION_TAG ${Ginkgo_VERSION_TAG}) +# Determine which executors can be compiled +include(cmake/hip_path.cmake) +include(cmake/autodetect_executors.cmake) +include(cmake/build_type_helpers.cmake) + # Ginkgo configuration options -option(GINKGO_DEVEL_TOOLS "Add development tools to the build system" ON) +option(GINKGO_DEVEL_TOOLS "Add development tools to the build system" OFF) 
option(GINKGO_BUILD_TESTS "Generate build files for unit tests" ON) option(GINKGO_BUILD_EXAMPLES "Build Ginkgo's examples" ON) option(GINKGO_BUILD_BENCHMARKS "Build Ginkgo's benchmarks" ON) option(GINKGO_BUILD_REFERENCE "Compile reference CPU kernels" ON) -option(GINKGO_BUILD_OMP "Compile OpenMP kernels for CPU" OFF) -option(GINKGO_BUILD_CUDA "Compile kernels for NVIDIA GPUs" OFF) +option(GINKGO_BUILD_OMP "Compile OpenMP kernels for CPU" ${GINKGO_HAS_OMP}) +option(GINKGO_BUILD_CUDA "Compile kernels for NVIDIA GPUs" ${GINKGO_HAS_CUDA}) +option(GINKGO_BUILD_HIP "Compile kernels for AMD or NVIDIA GPUs" ${GINKGO_HAS_HIP}) option(GINKGO_BUILD_DOC "Generate documentation" OFF) option(GINKGO_SKIP_DEPENDENCY_UPDATE "Do not update dependencies each time the project is rebuilt" ON) @@ -20,6 +26,9 @@ option(GINKGO_EXPORT_BUILD_DIR OFF) option(GINKGO_WITH_CLANG_TIDY "Make Ginkgo call `clang-tidy` to find programming issues." OFF) option(GINKGO_WITH_IWYU "Make Ginkgo call `iwyu` (Include What You Use) to find include issues." OFF) +option(GINKGO_CHECK_CIRCULAR_DEPS + "Enable compile-time checks detecting circular dependencies between libraries and non-self-sufficient headers." + OFF) set(GINKGO_VERBOSE_LEVEL "1" CACHE STRING "Verbosity level. Put 0 to turn off. 1 activates a few important messages.") if(MSVC) @@ -34,29 +43,47 @@ set(GINKGO_CUDA_COMPILER_FLAGS "" CACHE STRING set(GINKGO_CUDA_ARCHITECTURES "Auto" CACHE STRING "A list of target NVIDIA GPU achitectures. See README.md for more detail.") option(GINKGO_CUDA_DEFAULT_HOST_COMPILER "Tell Ginkgo to not automatically set the CUDA host compiler" OFF) +set(GINKGO_HIP_COMPILER_FLAGS "" CACHE STRING + "Set the required HIP compiler flags. Current default is an empty string.") +set(GINKGO_HIP_NVCC_COMPILER_FLAGS "" CACHE STRING + "Set the required HIP nvcc compiler flags. Current default is an empty string.") +set(GINKGO_HIP_HCC_COMPILER_FLAGS "" CACHE STRING + "Set the required HIP HCC compiler flags. 
Current default is an empty string.") +set(GINKGO_HIP_CLANG_COMPILER_FLAGS "" CACHE STRING + "Set the required HIP CLANG compiler flags. Current default is an empty string.") +set(GINKGO_HIP_AMDGPU "" CACHE STRING + "The amdgpu_target(s) variable passed to hipcc. The default is none (auto).") option(GINKGO_JACOBI_FULL_OPTIMIZATIONS "Use all the optimizations for the CUDA Jacobi algorithm" OFF) option(BUILD_SHARED_LIBS "Build shared (.so, .dylib, .dll) libraries" ON) +set(GINKGO_CIRCULAR_DEPS_FLAGS "-Wl,--no-undefined") + if(BUILD_SHARED_LIBS AND (WIN32 OR CYGWIN) AND (GINKGO_BUILD_TESTS OR GINKGO_BUILD_EXAMPLES OR GINKGO_BUILD_BENCHMARKS)) # Change shared libraries output only if this build has executable program with shared libraries. set(GINKGO_CHANGED_SHARED_LIBRARY TRUE) option(GINKGO_CHECK_PATH "Tell Ginkgo to check if the environment variable PATH is available for this build." ON) set(GINKGO_WINDOWS_SHARED_LIBRARY_RELPATH "windows_shared_library" CACHE STRING "Set Ginkgo's shared library relative path in windows. Current default is `windows_shared_library`. 
\ - This absoulte path ${PROJECT_BINARY_DIR}/GINKGO_WINDOWS_SHARED_LIBRARY_RELPATH must be in the environment variable PATH.") + This absolute path ${PROJECT_BINARY_DIR}/GINKGO_WINDOWS_SHARED_LIBRARY_RELPATH must be in the environment variable PATH.") set(GINKGO_WINDOWS_SHARED_LIBRARY_PATH ${PROJECT_BINARY_DIR}/${GINKGO_WINDOWS_SHARED_LIBRARY_RELPATH}) else() set(GINKGO_CHANGED_SHARED_LIBRARY FALSE) endif() -if(GINKGO_BUILD_TESTS AND (GINKGO_BUILD_CUDA OR GINKGO_BUILD_OMP)) +if(GINKGO_BUILD_TESTS AND (GINKGO_BUILD_CUDA OR GINKGO_BUILD_OMP OR GINKGO_BUILD_HIP)) message(STATUS "GINKGO_BUILD_TESTS is ON, enabling GINKGO_BUILD_REFERENCE") set(GINKGO_BUILD_REFERENCE ON CACHE BOOL "Compile reference CPU kernels" FORCE) endif() if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - message(STATUS "Setting build type to 'Release' as none was specified.") - set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) + message(STATUS "Setting build type to 'Release' as none was specified.") + set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." 
FORCE) +endif() + +if(BUILD_SHARED_LIBS) + set(GINKGO_STATIC_OR_SHARED SHARED) +else() + set(GINKGO_STATIC_OR_SHARED STATIC) endif() # Ensure we have a debug postfix @@ -77,22 +104,15 @@ if(GINKGO_BUILD_TESTS) include(CTest) endif() -if (GINKGO_WITH_CLANG_TIDY) - find_program(GINKGO_CLANG_TIDY_PATH clang-tidy) +if(GINKGO_WITH_CLANG_TIDY) + find_program(GINKGO_CLANG_TIDY_PATH clang-tidy) endif() -if (GINKGO_WITH_IWYU) - find_program(GINKGO_IWYU_PATH iwyu) +if(GINKGO_WITH_IWYU) + find_program(GINKGO_IWYU_PATH iwyu) endif() - -# Load CMake helpers and modules -include(cmake/build_helpers.cmake) -include(cmake/build_type_helpers.cmake) -include(cmake/create_test.cmake) -include(cmake/install_helpers.cmake) -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") - +list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/Modules/") # Find important header files, store the definitions in include/ginkgo/config.h.in # For details, see https://gitlab.kitware.com/cmake/community/wikis/doc/tutorials/How-To-Write-Platform-Checks @@ -102,13 +122,41 @@ check_include_file_cxx(cxxabi.h GKO_HAVE_CXXABI_H) # Automatically find PAPI and search for the required 'sde' component set(GINKGO_HAVE_PAPI_SDE 0) find_package(PAPI OPTIONAL_COMPONENTS sde) -if (PAPI_sde_FOUND) +if(PAPI_sde_FOUND) set(GINKGO_HAVE_PAPI_SDE 1) endif() +set(GINKGO_HIP_PLATFORM_NVCC 0) +set(GINKGO_HIP_PLATFORM_HCC 0) + +if(GINKGO_BUILD_HIP) + # GINKGO_HIPCONFIG_PATH and HIP_PATH are set in cmake/hip_path.cmake + if(DEFINED ENV{HIP_PLATFORM}) + set(GINKGO_HIP_PLATFORM "$ENV{HIP_PLATFORM}") + elseif(GINKGO_HIPCONFIG_PATH) + execute_process(COMMAND ${GINKGO_HIPCONFIG_PATH} --platform OUTPUT_VARIABLE GINKGO_HIP_PLATFORM) + else() + message(FATAL_ERROR "No platform could be found for HIP. 
" + "Set and export the environment variable HIP_PLATFORM.") + endif() + message(STATUS "HIP platform set to ${GINKGO_HIP_PLATFORM}") + + if (GINKGO_HIP_PLATFORM STREQUAL "hcc") + set(GINKGO_HIP_PLATFORM_HCC 1) + elseif (GINKGO_HIP_PLATFORM STREQUAL "nvcc") + set(GINKGO_HIP_PLATFORM_NVCC 1) + endif() +endif() + configure_file(${Ginkgo_SOURCE_DIR}/include/ginkgo/config.hpp.in ${Ginkgo_BINARY_DIR}/include/ginkgo/config.hpp @ONLY) +# Load CMake helpers +include(cmake/build_helpers.cmake) +include(cmake/hip_helpers.cmake) +include(cmake/install_helpers.cmake) +include(cmake/windows_helpers.cmake) + # This is modified from https://gitlab.kitware.com/cmake/community/wikis/FAQ#dynamic-replace if(MSVC) if(BUILD_SHARED_LIBS) @@ -129,19 +177,25 @@ ginkgo_find_package(gflags gflags FALSE 2.2.2) ginkgo_find_package(RapidJSON rapidjson TRUE 1.1.0) add_subdirectory(third_party) # Third-party tools and libraries +# Ginkgo core libraries # Needs to be first in order for `CMAKE_CUDA_DEVICE_LINK_EXECUTABLE` to be # propagated to the other parts of Ginkgo in case of building as static libraries if(GINKGO_BUILD_CUDA) add_subdirectory(cuda) # High-performance kernels for NVIDIA GPUs endif() -# Ginkgo core libraries add_subdirectory(core) # Core Ginkgo types and top-level functions +add_subdirectory(include) # Public API self-contained check if (GINKGO_BUILD_REFERENCE) add_subdirectory(reference) # Reference kernel implementations endif() if (GINKGO_BUILD_OMP) add_subdirectory(omp) # High-performance omp kernels endif() +# HIP needs to be last because it builds the GINKGO_RPATH_FOR_HIP variable +# which needs to know the `ginkgo` target. 
+if(GINKGO_BUILD_HIP) + add_subdirectory(hip) # High-performance kernels for AMD or NVIDIA GPUs +endif() # Non core directories and targets if(GINKGO_BUILD_EXAMPLES) @@ -159,13 +213,33 @@ if(GINKGO_DEVEL_TOOLS) add_dependencies(format add_license) endif() -# Generate the global `ginkgo/ginkgo.hpp` header with every call of make -# when bash is present and the developer tools are enabled +# MacOS needs to install bash, gnu-sed, findutils and coreutils +# format_header needs clang-format 6.0.0+ find_program(BASH bash) if(NOT "${BASH}" STREQUAL "BASH-NOTFOUND" AND GINKGO_DEVEL_TOOLS) add_custom_target(generate_ginkgo_header ALL COMMAND ${Ginkgo_SOURCE_DIR}/dev_tools/scripts/update_ginkgo_header.sh WORKING_DIRECTORY ${Ginkgo_SOURCE_DIR}) + find_program(GIT git) + if(NOT "${GIT}" STREQUAL "GIT-NOTFOUND") + add_custom_target(format_header + COMMAND echo "format header on the modified code files except build/examples/third_party/ginkgo.hpp" + COMMAND bash -c "git diff --name-only origin/master...HEAD | \ + grep -Ev 'build|examples|third_party|ginkgo.hpp' | \ + grep -E '(\.hip)?\.(cu|hpp|cuh|cpp)$' | \ + xargs -r -n1 ${Ginkgo_SOURCE_DIR}/dev_tools/scripts/format_header.sh" + WORKING_DIRECTORY ${Ginkgo_SOURCE_DIR} + VERBATIM) + endif() + unset(GIT CACHE) + add_custom_target(format_header_all + COMMAND echo "format header on all code files except build/examples/third_party/ginkgo.hpp" + COMMAND bash -c "find * -type f | \ + grep -Ev 'build|examples|third_party|ginkgo.hpp' | \ + grep -E '(\.hip)?\.(cu|hpp|cuh|cpp)$' | \ + xargs -r -n1 ${Ginkgo_SOURCE_DIR}/dev_tools/scripts/format_header.sh" + WORKING_DIRECTORY ${Ginkgo_SOURCE_DIR} + VERBATIM) endif() unset(BASH CACHE) @@ -182,15 +256,34 @@ endif() configure_file(${Ginkgo_SOURCE_DIR}/cmake/ginkgo.pc.in ${Ginkgo_BINARY_DIR}/ginkgo.pc @ONLY) +# WINDOWS NVCC has " inside the string, add escape character to avoid config problem. 
+ginkgo_modify_flags(CMAKE_CUDA_FLAGS) +ginkgo_modify_flags(CMAKE_CUDA_FLAGS_DEBUG) +ginkgo_modify_flags(CMAKE_CUDA_FLAGS_RELEASE) ginkgo_install() +if(MSVC) + # Set path/command with $<CONFIG> + set(GINKGO_TEST_INSTALL_COMMAND "${Ginkgo_BINARY_DIR}/test_install/$<CONFIG>/test_install") + if(GINKGO_BUILD_CUDA) + set(GINKGO_TEST_INSTALL_COMMAND "${GINKGO_TEST_INSTALL_COMMAND}" "${Ginkgo_BINARY_DIR}/test_install/$<CONFIG>/test_install_cuda") + endif() +else() + set(GINKGO_TEST_INSTALL_COMMAND "${Ginkgo_BINARY_DIR}/test_install/test_install") + if(GINKGO_BUILD_CUDA) + set(GINKGO_TEST_INSTALL_COMMAND "${GINKGO_TEST_INSTALL_COMMAND}" "${Ginkgo_BINARY_DIR}/test_install/test_install_cuda") + endif() +endif() add_custom_target(test_install COMMAND ${CMAKE_COMMAND} -G${CMAKE_GENERATOR} -H${Ginkgo_SOURCE_DIR}/test_install - -B${Ginkgo_BINARY_DIR}/test_install - -DCMAKE_PREFIX_PATH=${CMAKE_INSTALL_PREFIX}/${GINKGO_INSTALL_CONFIG_DIR} - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - COMMAND ${CMAKE_COMMAND} --build ${Ginkgo_BINARY_DIR}/test_install - COMMAND ${Ginkgo_BINARY_DIR}/test_install/test_install + -B${Ginkgo_BINARY_DIR}/test_install + -DCMAKE_PREFIX_PATH=${CMAKE_INSTALL_PREFIX}/${GINKGO_INSTALL_CONFIG_DIR} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER} + # `--config cfg` is ignored by single-configuration generator. + # `$<CONFIG>` is always the same as `CMAKE_BUILD_TYPE` in single-configuration generator. + COMMAND ${CMAKE_COMMAND} --build ${Ginkgo_BINARY_DIR}/test_install --config $<CONFIG> + COMMAND ${GINKGO_TEST_INSTALL_COMMAND} COMMENT "Running a test on the installed binaries. 
This requires running `(sudo) make install` first.") # Setup CPack diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000000..9fcdc25ed13 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,610 @@ +# Contributing guidelines {#contributing_guidelines} + +We are glad that you are interested in contributing to Ginkgo. Please have a +look at our coding guidelines before proposing a pull request. + +## Table of Contents + +[Most Important stuff](#most-important-stuff-a-tldr) + +[Project Structure](#project-structure) + * [Extended header files](#extended-header-files) + * [Using library classes](#using-library-classes) + +[Git related](#git-related) + * [Our git Workflow](#our-git-workflow) + * [Writing good commit messages](#writing-good-commit-messages) + * [Creating, Reviewing and Merging Pull + Requests](#creating-reviewing-and-merging-pull-requests) + +[Code Style](#code-style) + * [Automatic code formatting](#automatic-code-formatting) + * [Naming Scheme](#naming-scheme) + * [Whitespace](#whitespace) + * [Include statement grouping](#include-statement-grouping) + * [Other Code Formatting not handled by + ClangFormat](#other-code-formatting-not-handled-by-clangformat) + * [CMake coding style](#cmake-coding-style) + +[Helper Scripts](#helper-scripts) + * [Create a new algorithm](#create-a-new-algorithm) + * [Converting CUDA code to HIP code](#converting-cuda-code-to-hip-code) + +[Writing Tests](#writing-tests) + * [Testing know-how](#testing-know-how) + * [Some general rules](#some-general-rules) + * [Writing tests for kernels](#writing-tests-for-kernels) + +[Documentation style](#documentation-style) + * [Developer targeted notes](#developer-targeted-notes) + * [Whitespaces](#whitespaces) + * [Documenting examples](#documenting-examples) + +[Other programming comments](#other-programming-comments) + * [C++ standard stream objects](#c-standard-stream-objects) + * [Warnings](#warnings) + * [Avoiding circular 
dependencies](#avoiding-circular-dependencies) + + +## Most important stuff (A TL;DR) + +* `GINKGO_DEVEL_TOOLS` needs to be set to `on` to commit. This requires + `clang-format` to be installed. See [Automatic code + formatting](#automatic-code-formatting) for more details. Once installed, you + can run `make format` in your `build/` folder to automatically format your + modified files. As `make format` unstages your files post-formatting, you must + stage the files again once you have verified that `make format` has done the + appropriate formatting, before committing the files. + +* See [Our git workflow](#our-git-workflow) to get a quick overview of our + workflow. + +* See [Creating, Reviewing and Merging Pull + Requests](#creating-reviewing-and-merging-pull-requests) on how to create a + Pull request. + + +## Project structure + +Ginkgo is divided into a `core` module with common functionalities independent +of the architecture, and several kernel modules (`reference`, `omp`, `cuda`, +`hip`) which contain low-level computational routines for each supported +architecture. + +### Extended header files + +Some header files from the core module have to be extended to include special +functionality for specific architectures. An example of this is +`core/base/math.hpp`, which has a GPU counterpart in `cuda/base/math.hpp`. For +such files you should always include the version from the module you are working +on, and this file will internally include its `core` counterpart. + +### Using library classes + +You can use and call functions of existing classes inside a kernel (that are +defined and not just declared in a header file), however, you are not allowed to +create new instances of a polymorphic class inside a kernel (or in general +inside any kernel module like cuda/hip/omp/reference) as this creates circular +dependencies between the `core` and the backend library. With this in mind, our +CI contains a job which checks if such a circular dependency exists. 
+These checks can be run manually using the `-DGINKGO_CHECK_CIRCULAR_DEPS=ON` +option in the CMake configuration. + +For example, when creating a new matrix class `AB` by combining existing classes +`A` and `B`, the `AB::apply()` function composed of invocations to `A::apply()` +and `B::apply()` can only be defined in the core module; it is not possible to +create instances of `A` and `B` inside the `AB` kernel files. This is to avoid +the aforementioned circular dependency issue. An example for such a class is the +`Hybrid` matrix format, which uses the `apply()` of the `Ell` and `Coo` matrix +formats. Nevertheless, it is possible to call the kernels themselves directly +within the same executor. For example, `cuda::dense::add_scaled()` can be called +from any other `cuda` kernel. + +## Git related + +Ginkgo uses git, the distributed version control system to track code changes +and coordinate work among its developers. A general guide to git can be found in +[its extensive documentation](https://git-scm.com/docs). + +### Our git workflow + +In Ginkgo, we prioritize keeping a clean history over accurate tracking of +commits. `git rebase` is hence our command of choice to make sure that we have a +nice and linear history, especially for pulling the latest changes from the +`develop` branch. More importantly, rebasing upon develop is **required** before +the commits of the PR are merged into the `develop` branch. + +### Writing good commit messages + +With software sustainability and maintainability in mind, it is important to +write commit messages that are short, clear and informative. Ideally, this would +be the format to prefer: + +```sh +Summary of the changes in a sentence, max 50 chars. + +More detailed comments: ++ Changes that have been added. +- Changes that have been removed. + +Related PR: https://github.com/ginkgo-project/ginkgo/pull/<PR-number> +``` + +You can refer to [this informative +guide](https://chris.beams.io/posts/git-commit/) for more details. 
+ +#### Attributing credit + +Git has a nice feature where it allows you to add a co-author for your commit, +if you would like to attribute credits for the changes made in the commit. This +can be done by: + +```sh +Commit message. + +Co-authored-by: Name <email@domain> +``` + +In the Ginkgo commit history, this is most commonly associated with suggested +improvements from code reviews. + +### Creating, Reviewing and Merging Pull Requests + +* The `develop` branch is the default branch to submit PRs to. From time to + time, we merge the `develop` branch to the `master` branch and create tags on + the `master` to create new releases of Ginkgo. Therefore, all pull requests + must be merged into `develop`. +* Please have a look at the labels and make sure to add the relevant labels. +* You can mark the PR as a `WIP` if you are still working on it, `Ready for + Review` when it is ready for others to review it. +* Assignees to the PR should be the ones responsible for merging that PR. + Currently, it is only possible to assign members within the `ginkgo-project`. +* Each pull request requires at least two approvals before merging. +* PRs created from within the repository will automatically trigger two CI + pipelines on pushing to the branch from which the PR has been created. The + GitHub Actions pipeline tests our framework on Mac OSX and on Windows + platforms. Another comprehensive Linux based pipeline is run from a [mirror on + gitlab](https://gitlab.com/ginkgo-project/ginkgo-public-ci/pipelines) and + contains additional checks like static analysis and test coverage. +* Once a PR has been approved and the build has passed, one of the reviewers can + mark the PR as `READY TO MERGE`. At this point the creator/assignee of the PR + *needs to* verify that the branch is up to date with `develop` and rebase it + on `develop` if it is not. 
+ + +## Code style + +### Automatic code formatting + +Ginkgo uses [ClangFormat](https://clang.llvm.org/docs/ClangFormat.html) +(executable is usually named `clang-format`) and a custom `.clang-format` +configuration file (mostly based on ClangFormat's _Google_ style) to +automatically format your code. __Make sure you have ClangFormat set up and +running properly__ ( you should be able to run `make format` from Ginkgo's build +directory) before committing anything that will end up in a pull request against +`ginkgo-project/ginkgo` repository. In addition, you should __never__ modify the +`.clang-format` configuration file shipped with Ginkgo. E.g. if ClangFormat has +trouble reading this file on your system, you should install a newer version of +ClangFormat, and avoid commenting out parts of the configuration file. + +ClangFormat is the primary tool that helps us achieve a uniform look of Ginkgo's +codebase, while reducing the learning curve of potential contributors. However, +ClangFormat configuration is not expressive enough to incorporate the entire +coding style, so there are several additional rules that all contributed code +should follow. + +_Note_: To learn more about how ClangFormat will format your code, see existing +files in Ginkgo, `.clang-format` configuration file shipped with Ginkgo, and +ClangFormat's documentation. + +### Naming scheme + +#### Filenames + +Filenames use `snake_case` and use the following extensions: +* C++ source files: `.cpp` +* C++ header files: `.hpp` +* CUDA source files: `.cu` +* CUDA header files: `.cuh` +* HIP source files: `.hip.cpp` +* HIP header files: `.hip.hpp` +* Common source files used by both CUDA and HIP: `.hpp.inc` +* CMake utility files: `.cmake` +* Shell scripts: `.sh` + +_Note:_ A C++ source/header file is considered a `CUDA` file if it contains CUDA +code that is not guarded with `#if` guards that disable this code in non-CUDA +compilers. I.e. 
if a file can be compiled by a general C++ compiler, it is not +considered a CUDA file. + +#### Macros + +Macros (both object-like and function-like macros) use `CAPITAL_CASE`. They have +to start with `GKO_` to avoid name clashes (even if they are `#undef`-ed in the +same file!). + +#### Variables + +Variables use `snake_case`. + +#### Constants + +Constants use `snake_case`. + +#### Functions + +Functions use `snake_case`. + +#### Structures and classes + +Structures and classes which do not experience polymorphic behavior (i.e. do not +contain virtual methods, nor members which experience polymorphic behavior) use +`snake_case`. + +All other structures and classes use `CamelCase`. + +#### Members + +All structure / class members use the same naming scheme as they would if they +were not members: +* methods use the naming scheme for functions +* data members the naming scheme for variables or constants +* type members for classes / structures + +Additionally, non-public data members end with an underscore (`_`). + +#### Namespaces + +Namespaces use `snake_case`. + +#### Template parameters + +* Type template parameters use `CamelCase`, for example `ValueType`. +* Non-type template parameters use `snake_case`, for example `subwarp_size`. + +### Whitespace + +Spaces and tabs are handled by ClangFormat, but blank lines are only partially +handled (the current configuration doesn't allow for more than 2 blank lines). +Thus, contributors should be aware of the following rules for blank lines: + +1. Top-level statements and statements directly within namespaces are separated + with 2 blank lines. The first / last statement of a namespace is separated + by two blank lines from the opening / closing brace of the namespace. + 1. 
_exception_: if the first __or__ the last statement in the namespace is + another namespace, then no blank lines are required + _example_: + ```c++ + namespace foo { + + + struct x { + }; + + + } // namespace foo + + + namespace bar { + namespace baz { + + + void f(); + + + } // namespace baz + } // namespace bar + ``` + + 2. _exception_: in header files whose only purpose is to _declare_ a bunch + of functions (e.g. the `*_kernel.hpp` files) these declarations can be + separated by only 1 blank line (note: standard rules apply for all other + statements that might be present in that file) + 3. _exception_: "related" statements can have 1 blank line between them. + "Related" is not a strictly defined adjective in this sense, but is in + general one of: + + 1. overloads of the same function, + 2. function / class template and its specializations, + 3. macro that modifies the meaning or adds functionality to the + previous / following statement. + + However, simply calling function `f` from function `g` does not imply + that `f` and `g` are "related". +2. Statements within structures / classes are separated with 1 blank line. + There are no blank lines between the first / last statement in the + structure / class. + 1. _exception_: there is no blank line between an access modifier (`private`, `protected`, `public`) and the following statement. + _example_: + ```c++ + class foo { + public: + int get_x() const noexcept { return x_; } + + int &get_x() noexcept { return x_; } + + private: + int x_; + }; + ``` + +3. Function bodies cannot have multiple consecutive blank lines, and a single + blank line can only appear between two logical sections of the function. +4. Unit tests should follow the [AAA](http://wiki.c2.com/?ArrangeActAssert) + pattern, and a single blank line must appear between consecutive "A" + sections. No other blank lines are allowed in unit tests. +5. Enumeration definitions should have no blank lines between consecutive + enumerators. 
+ + +### Include statement grouping + +In general, all include statements should be present on the top of the file, +ordered in the following groups, with two blank lines between each group: + +1. Related header file (e.g. `core/foo/bar.hpp` included in `core/foo/bar.cpp`, + or in the unit test `core/test/foo/bar.cpp`) +2. Standard library headers (e.g. `vector`) +3. Executor specific library headers (e.g. `omp.h`) +4. System third-party library headers (e.g. `papi.h`) +5. Local third-party library headers +6. Public Ginkgo headers +7. Private Ginkgo headers + +_Example_: A file `core/base/my_file.cpp` might have an include list like this: + +```c++ +#include <ginkgo/core/base/my_file.hpp> + + +#include <algorithm> +#include <vector> +#include <tuple> + + +#include <omp.h> + + +#include <papi.h> + + +#include "third_party/blas/cblas.hpp" +#include "third_party/lapack/lapack.hpp" + + +#include <ginkgo/core/base/executor.hpp> +#include <ginkgo/core/base/lin_op.hpp> +#include <ginkgo/core/base/types.hpp> + + +#include "core/base/my_file_kernels.hpp" +``` + +#### Main header + +This section presents general rules used to define the main header attributed to +the file. In the previous example, this would be `#include +<ginkgo/core/base/my_file.hpp>`. + +General rules: +1. Some fixed main header. +2. components: + - with `_kernel` suffix looks for the header in the same folder. + - without `_kernel` suffix looks for the header in `core`. +3. `test/utils`: looks for the header in `core` +4. `core`: looks for the header in `ginkgo` +5. `test` or `base`: looks for the header in `ginkgo/core` +6. others: looks for the header in `core` + +_Note_: Please see the details in the `dev_tools/scripts/config`. + +#### Some general comments. + +1. Private headers of Ginkgo should not be included within the public Ginkgo header. +2. It is a good idea to keep the headers self-sufficient. See [Google Style guide for reasoning](https://google.github.io/styleguide/cppguide.html#Self_contained_Headers). +When compiling with `GINKGO_CHECK_CIRCULAR_DEPS` enabled, this property is explicitly checked. +3. 
The recommendations of the `iwyu` (Include what you use) tool can be used to make sure that the headers are self-sufficient and that the compiled files ( `.cu`, `.cpp`, `.hip.cpp` ) include only what they use. A [CI pipeline](https://gitlab.com/ginkgo-project/ginkgo-public-ci/-/jobs/584358356) is available that runs with the `iwyu` tool. Please be aware that this tool can be incorrect in some cases. + +#### Automatic header arrangement + +1. `dev_tools/scripts/format_header.sh` will take care of the group/sorting of + headers according to this guideline. +2. `make format_header` arranges the header of the modified files in the branch. +3. `make format_header_all` arranges the header of all files. + + +### Other Code Formatting not handled by ClangFormat + +#### Control flow constructs + +Single line statements should be avoided in all cases. Use of brackets is +mandatory for all control flow constructs (e.g. `if`, `for`, `while`, ...). + +#### Variable declarations + +C++ supports declaring / defining multiple variables using a single +_type-specifier_. However, this is often very confusing as references and +pointers exhibit strange behavior: + +```c++ +template <typename T> using pointer = T *; + +int * x, y; // x is a pointer, y is not +pointer<int> x, y; // both x and y are pointers +``` + +For this reason, __always__ declare each variable on a separate line, with its +own _type-specifier_. + +### CMake coding style + +#### Whitespaces + +All alignment in CMake files should use four spaces. + +#### Use of macros vs functions + +Macros in CMake do not have a scope. This means that any variable set in a +macro will be available to the whole project. In contrast, functions in CMake +have local scope and therefore all set variables are local only. In general, +wrap all pieces of algorithms using temporary variables in a function and use +macros to propagate variables to the whole project. 
+ +#### Naming style + +All Ginkgo specific variables should be prefixed with `GINKGO_` and all +functions by `ginkgo_`. + + +## Helper scripts + +To facilitate easy development within Ginkgo and to encourage coders and +scientists who do not want to get bogged down by the details of the Ginkgo library, +but rather focus on writing the algorithms and the kernels, Ginkgo provides the +developers with a few helper scripts. + +### Create a new algorithm + +A `create_new_algorithm.sh` script is available for developers to facilitate +easy addition of new algorithms. The options it provides can be queried with + +```sh +./create_new_algorithm.sh --help +``` +The main objective of this script is to add files and boilerplate code for the +new algorithm using a model and an instance of that model. For example, models +can be any one of `factorization`, `matrix`, `preconditioner` or `solver`. For +example, to create a new solver named `my_solver` similar to `gmres`, you would +set the `ModelType` to `solver` and set the `ModelName` to `gmres`. This would +duplicate the core algorithm and kernels of the `gmres` algorithm and replace +the naming to `my_solver`. Additionally, all the kernels of the new `my_solver` +are marked as `GKO_NOT_IMPLEMENTED`. For easy navigation, a `.txt` file is created +in the folder where the script is run, which lists all the TODO's. These TODO's can +also be found in the corresponding files. + +### Converting CUDA code to HIP code +We provide a `cuda2hip` script that converts `cuda` kernel code into `hip` kernel code. +Internally, this script calls the [`hipify` script](https://github.com/ROCm-Developer-Tools/HIPIFY) provided by HIP, converting the CUDA syntax +to HIP syntax. Additionally, it automatically replaces the instances of +CUDA with HIP as appropriate. Hence, this script can be called on a Ginkgo CUDA +file. You can find this script in the `dev_tools/scripts/` folder. 
+ + +## Writing Tests + +Ginkgo uses the [GTest framework](https://github.com/google/googletest) as the +unit test framework within Ginkgo. Writing good tests is extremely important to +verify the functionality of the new code and to make sure that none of the +existing code has been broken. + +### Testing know-how + +* GTest provides [comprehensive + documentation](https://github.com/google/googletest/blob/master/googletest/docs/primer.md) + of the functionality available within GTest. +* Reduce code duplication with [Testing Fixtures, + `TEST_F`](https://github.com/google/googletest/blob/master/googletest/docs/primer.md#test-fixtures-using-the-same-data-configuration-for-multiple-tests-same-data-multiple-tests) +* Write templated tests using + [`TYPED_TEST`](https://github.com/google/googletest/blob/master/googletest/docs/advanced.md#typed-tests). + +### Some general rules. + +* Unit tests must follow the [KISS + principle](https://en.wikipedia.org/wiki/KISS_principle). +* Unit tests must follow the [AAA](http://wiki.c2.com/?ArrangeActAssert) + pattern, and a single blank line must appear between consecutive "A" sections. + +### Writing tests for kernels + +* Reference kernels, kernels on the `ReferenceExecutor`, are meant to be single + threaded reference implementations. Therefore, tests for reference kernels + need to be performed with data that can be as small as possible. For example, + matrices smaller than 5x5 are acceptable. This allows the reviewers to verify + the results for exactness with tools such as MATLAB. +* OpenMP, CUDA and HIP kernels have to be tested against the reference kernels. + Hence data for the tests of these kernels can be generated in the test files + using helper functions or by using external files to be read through the + standard input. In particular for CUDA and HIP, the data size should be at + least bigger than the architecture's warp size to ensure there is no corner + case in the kernels. 
+ + +## Documentation style + +Documentation uses standard Doxygen. + +### Developer targeted notes + +Make use of `@internal` doxygen tag. This can be used for any comment which is +not intended for users, but is useful to better understand a piece of code. + +### Whitespaces + +#### After named tags such as `@param foo` + +The documentation tags which use an additional name should be followed by two +spaces in order to better distinguish the text from the doxygen tag. It is also +possible to use a line break instead. + +### Documenting examples + +There are two main steps: + +1. First, you can just copy over the + [`doc/`](https://github.com/ginkgo-project/ginkgo/tree/master/examples/simple-solver) + folder (you can copy it from the example most relevant to you) and adapt your + example names and such, then you can modify the actual documentation. ++ In `tooltip`: A short description of the example. ++ In `short-intro`: The name of the example. ++ In `results.dox`: Run the example and write the output you get. ++ In `kind`: The kind of the example. For different kinds see [the + documentation](https://ginkgo-project.github.io/ginkgo/doc/master/Examples.html). + Examples can be of `basic`, `techniques`, `logging`, `stopping_criteria` or + `preconditioners`. If your example does not fit any of these categories, feel + free to create one. ++ In `intro.dox`: You write an explanation of your code with some introduction + similar to what you see in an existing example most relevant to you. ++ In `builds-on`: You write the examples it builds on. + +2. You also need to modify the + [examples.hpp.in](https://github.com/ginkgo-project/ginkgo/blob/master/doc/examples/examples.hpp.in) + file. You add the name of the example in the main section and in the section + that you specified in the `doc/kind` file in the example documentation. 
+ + +## Other programming comments + +### C++ standard stream objects + +These are global objects and are shared inside the same translation unit. +Therefore, whenever its state or formatting is changed (e.g. using `std::hex` or +floating point formatting) inside library code, make sure to restore the state +before returning the control to the user. See this [stackoverflow +question](https://stackoverflow.com/questions/2273330/restore-the-state-of-stdcout-after-manipulating-it) +for examples on how to do it correctly. This is extremely important for header +files. + +### Warnings + +By default, the `-DGINKGO_COMPILER_FLAGS` is set to `-Wpedantic` and hence +pedantic warnings are emitted by default. Some of these warnings are false +positives and a complete list of the resolved warnings and their solutions is +listed in [Issue 174](https://github.com/ginkgo-project/ginkgo/issues/174). +Specifically, when macros are being used, we have the issue of having `extra ;` +warnings, which is resolved by adding a `static_assert()`. The CI system +additionally also has a step where it compiles for pedantic warnings to be +errors. + +### Avoiding circular dependencies + +To facilitate finding circular dependencies issues (see [Using library +classes](#using-library-classes) for more details), a CI step `no-circular-deps` +was created. For more details on its usage, see [this +pipeline](https://gitlab.com/ginkgo-project/ginkgo-public-ci/pipelines/52941979), +where Ginkgo did not abide to this policy and [PR +#278](https://github.com/ginkgo-project/ginkgo/pull/278) which fixed this. Note +that doing so is not enough to guarantee with 100% accuracy that no circular +dependency is present. For an example of such a case, take a look at [this +pipeline](https://gitlab.com/ginkgo-project/ginkgo-public-ci/pipelines/53006772) +where one of the compiler setups detected an incorrect dependency of the `cuda` +module (due to jacobi) on the `core` module. 
diff --git a/INSTALL.md b/INSTALL.md index d542d4d27a9..a3456d24ff7 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -1,6 +1,6 @@ Installation Instructions {#install_ginkgo} ------------------------------------- -### Building +### Building Use the standard cmake build procedure: @@ -17,7 +17,7 @@ Ginkgo adds the following additional switches to control what is being built: * `-DGINKGO_DEVEL_TOOLS={ON, OFF}` sets up the build system for development (requires clang-format, will also download git-cmake-format), - default is `ON`. + default is `OFF`. * `-DGINKGO_BUILD_TESTS={ON, OFF}` builds Ginkgo's tests (will download googletest), default is `ON`. * `-DGINKGO_BUILD_BENCHMARKS={ON, OFF}` builds Ginkgo's benchmarks @@ -28,9 +28,15 @@ Ginkgo adds the following additional switches to control what is being built: * `-DGINKGO_BUILD_REFERENCE={ON, OFF}` build reference implementations of the kernels, useful for testing, default is `ON` * `-DGINKGO_BUILD_OMP={ON, OFF}` builds optimized OpenMP versions of the kernels, - default is `OFF` + default is `ON` if the selected C++ compiler supports OpenMP, `OFF` otherwise. * `-DGINKGO_BUILD_CUDA={ON, OFF}` builds optimized cuda versions of the kernels - (requires CUDA), default is `OFF` + (requires CUDA), default is `ON` if a CUDA compiler could be detected, + `OFF` otherwise. +* `-DGINKGO_BUILD_HIP={ON, OFF}` builds optimized HIP versions of the kernels + (requires HIP), default is `ON` if an installation of HIP could be detected, + `OFF` otherwise. +* `-DGINKGO_HIP_AMDGPU="gpuarch1;gpuarch2"` the amdgpu_target(s) variable + passed to hipcc for the `hcc` HIP backend. The default is none (auto). * `-DGINKGO_BUILD_DOC={ON, OFF}` creates an HTML version of Ginkgo's documentation from inline comments in the code. The default is `OFF`. 
* `-DGINKGO_DOC_GENERATE_EXAMPLES={ON, OFF}` generates the documentation of examples @@ -47,6 +53,9 @@ Ginkgo adds the following additional switches to control what is being built: * `-DGINKGO_WITH_IWYU={ON, OFF}` makes Ginkgo call `iwyu` to find include issues. The path can be manually controlled with the CMake variable `-DGINKGO_IWYU_PATH=`. The default is `OFF`. +* `-DGINKGO_CHECK_CIRCULAR_DEPS={ON, OFF}` enables compile-time checks for + circular dependencies between different Ginkgo libraries and self-sufficient + headers. Should only be used for development purposes. The default is `OFF`. * `-DGINKGO_VERBOSE_LEVEL=integer` sets the verbosity of Ginkgo. * `0` disables all output in the main libraries, * `1` enables a few important messages related to unexpected behavior (default). @@ -54,8 +63,9 @@ Ginkgo adds the following additional switches to control what is being built: The default value is usually something like `/usr/local`. * `-DCMAKE_BUILD_TYPE=type` specifies which configuration will be used for this build of Ginkgo. The default is `RELEASE`. Supported values are CMake's - standard build types such as `DEBUG` and `RELEASE` and the Ginkgo specific - `COVERAGE`, `ASAN` (AddressSanitizer) and `TSAN` (ThreadSanitizer) types. + standard build types such as `DEBUG` and `RELEASE` and the Ginkgo specific + `COVERAGE`, `ASAN` (AddressSanitizer), `LSAN` (LeakSanitizer), `TSAN` + (ThreadSanitizer) and `UBSAN` (undefined behavior sanitizer) types. * `-DBUILD_SHARED_LIBS={ON, OFF}` builds ginkgo as shared libraries (`OFF`) or as dynamic libraries (`ON`), default is `ON`. * `-DGINKGO_JACOBI_FULL_OPTIMIZATIONS={ON, OFF}` use all the optimizations @@ -99,14 +109,110 @@ For example, to build everything (in debug mode), use: ```cmake cmake -G "Unix Makefiles" -H. 
-BDebug -DCMAKE_BUILD_TYPE=Debug -DGINKGO_DEVEL_TOOLS=ON \ - -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_REFERENCE=ON -DGINKGO_BUILD_OMP=ON \ - -DGINKGO_BUILD_CUDA=ON + -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_REFERENCE=ON -DGINKGO_BUILD_OMP=ON \ + -DGINKGO_BUILD_CUDA=ON -DGINKGO_BUILD_HIP=ON cmake --build Debug ``` NOTE: Ginkgo is known to work with the `Unix Makefiles` and `Ninja` based generators. Other CMake generators are untested. +### Building Ginkgo with HIP support +Ginkgo provides a [HIP](https://github.com/ROCm-Developer-Tools/HIP) backend. +This allows to compile optimized versions of the kernels for either AMD or +NVIDIA GPUs. The CMake configuration step will try to auto-detect the presence +of HIP either at `/opt/rocm/hip` or at the path specified by `HIP_PATH` as a +CMake parameter (`-DHIP_PATH=`) or environment variable (`export HIP_PATH=`), +unless `-DGINKGO_BUILD_HIP=ON/OFF` is set explicitly. + +#### Correctly installing HIP toolkits and dependencies for Ginkgo +In general, Ginkgo's HIP backend requires the following packages: ++ HIP, ++ hipBLAS, ++ hipSPARSE, ++ Thrust. + +It is necessary to provide some details about the different ways to +procure and install these packages, in particular for NVIDIA systems since +getting a correct, non bloated setup is not straightforward. + +For AMD systems, the simplest way is to follow the [instructions provided +here](https://github.com/ROCm-Developer-Tools/HIP/blob/master/INSTALL.md) which +provide package installers for most Linux distributions. Ginkgo also needs the +installation of the [hipBLAS](https://github.com/ROCmSoftwarePlatform/hipBLAS) +and [hipSPARSE](https://github.com/ROCmSoftwarePlatform/hipSPARSE) interfaces. +Optionally if you do not already have a thrust installation, [the ROCm provided +rocThrust package can be +used](https://github.com/ROCmSoftwarePlatform/rocThrust). 
+ +For NVIDIA systems, the traditional installation (package `hip_nvcc`), albeit +working properly is currently odd: it depends on all the `hcc` related packages, +although the `nvcc` backend seems to entirely rely on the CUDA suite. [See this +issue for more +details](https://github.com/ROCmSoftwarePlatform/hipBLAS/issues/53). It is +advised in this case to compile everything manually, including using forks of +`hipBLAS` and `hipSPARSE` specifically made to not depend on the `hcc` specific +packages. `Thrust` is often provided by CUDA and this Thrust version should work +with `HIP`. Here is a sample procedure for installing `HIP`, `hipBLAS` and +`hipSPARSE`. + + +```bash +# HIP +git clone https://github.com/ROCm-Developer-Tools/HIP.git +pushd HIP && mkdir build && pushd build +cmake .. && make install +popd && popd + +# hipBLAS +git clone https://github.com/tcojean/hipBLAS.git +pushd hipBLAS && mkdir build && pushd build +cmake .. && make install +popd && popd + +# hipSPARSE +git clone https://github.com/tcojean/hipSPARSE.git +pushd hipSPARSE && mkdir build && pushd build +cmake -DBUILD_CUDA=ON .. && make install +popd && popd +``` + + +#### Changing the paths to search for HIP and other packages +All HIP installation paths can be configured through the use of environment +variables or CMake variables. This way of configuring the paths is currently +imposed by the `HIP` tool suite. The variables are the following: ++ CMake `-DHIP_PATH=` or environment `export HIP_PATH=`: sets the `HIP` + installation path. The default value is `/opt/rocm/hip`. ++ CMake `-DHIPBLAS_PATH=` or environment `export HIPBLAS_PATH=`: sets the + `hipBLAS` installation path. The default value is `/opt/rocm/hipblas`. ++ CMake `-DHIPSPARSE_PATH=` or environment `export HIPSPARSE_PATH=`: sets the + `hipSPARSE` installation path. The default value is `/opt/rocm/hipsparse`. ++ CMake `-DHCC_PATH=` or environment `export HCC_PATH=`: sets the `HCC` + installation path, for AMD backends. 
The default value is `/opt/rocm/hcc`. ++ environment `export CUDA_PATH=`: where `hipcc` can find `CUDA` if it is not in + the default `/usr/local/cuda` path. + + +#### HIP platform detection of AMD and NVIDIA +By default, Ginkgo uses the output of `/opt/rocm/hip/bin/hipconfig --platform` +to select the backend. The accepted values are either `hcc` (AMD) or `nvcc` +(NVIDIA). When on an AMD or NVIDIA system, this should output the correct +platform by default. When on a system without GPUs, this should output `hcc` by +default. To change this value, export the environment variable `HIP_PLATFORM` +like so: +```bash +export HIP_PLATFORM=nvcc +``` + +#### Setting platform specific compilation flags +Platform specific compilation flags can be given through the following +CMake variables: ++ `-DGINKGO_HIP_COMPILER_FLAGS=`: compilation flags given to all platforms. ++ `-DGINKGO_HIP_HCC_COMPILER_FLAGS=`: compilation flags given to AMD platforms. ++ `-DGINKGO_HIP_NVCC_COMPILER_FLAGS=`: compilation flags given to NVIDIA platforms. + + ### Third party libraries and packages Ginkgo relies on third party packages in different cases. These third party diff --git a/LICENSE b/LICENSE index efb4bb6d9bf..48867b57a87 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/README.md b/README.md index 7a8ba605cdb..48d344e2c6e 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,12 @@ ![Ginkgo](/assets/logo.png) -[![Build status](https://gitlab.com/ginkgo-project/ginkgo-public-ci/badges/master/build.svg)](https://github.com/ginkgo-project/ginkgo/commits/master) +[![Build status](https://gitlab.com/ginkgo-project/ginkgo-public-ci/badges/master/pipeline.svg)](https://github.com/ginkgo-project/ginkgo/commits/master) +[![OSX-build](https://github.com/ginkgo-project/ginkgo/workflows/OSX-build/badge.svg?branch=master)](https://github.com/ginkgo-project/ginkgo/actions?query=workflow%3AOSX-build+branch%3Amaster) +[![Windows-build](https://github.com/ginkgo-project/ginkgo/workflows/windows-build/badge.svg?branch=master)](https://github.com/ginkgo-project/ginkgo/actions?query=workflow%3AWindows-build+branch%3Amaster) +[![codecov](https://codecov.io/gh/ginkgo-project/ginkgo/branch/master/graph/badge.svg)](https://codecov.io/gh/ginkgo-project/ginkgo/branch/master) +[![Maintainability Rating](https://sonarcloud.io/api/project_badges/measure?project=ginkgo-project_ginkgo&metric=sqale_rating)](https://sonarcloud.io/dashboard?id=ginkgo-project_ginkgo) +[![Reliability Rating](https://sonarcloud.io/api/project_badges/measure?project=ginkgo-project_ginkgo&metric=reliability_rating)](https://sonarcloud.io/dashboard?id=ginkgo-project_ginkgo) + [![CDash dashboard](https://img.shields.io/badge/CDash-Access-blue.svg)](http://my.cdash.org/index.php?project=Ginkgo+Project) [![Documentation](https://img.shields.io/badge/Documentation-latest-blue.svg)](https://ginkgo-project.github.io/ginkgo/doc/master/) [![License](https://img.shields.io/github/license/ginkgo-project/ginkgo.svg)](./LICENSE) @@ -9,7 +15,7 @@ Ginkgo is a high-performance linear algebra library for manycore systems, with a focus on sparse solution of linear systems. 
It is implemented using modern C++ (you will need at least C++11 compliant compiler to build it), with GPU kernels -implemented in CUDA. +implemented in CUDA and HIP. Performance @@ -31,7 +37,7 @@ For Ginkgo core library: * _cmake 3.9+_ * C++11 compliant compiler, one of: - * _gcc 5.3+, 6.3+, 7.3+, 8.1+_ + * _gcc 5.3+, 6.3+, 7.3+, all versions after 8.1+_ * _clang 3.9+_ * _Intel compiler 2017+_ * _Apple LLVM 8.0+_ (__TODO__: verify) @@ -44,20 +50,28 @@ The Ginkgo CUDA module has the following __additional__ requirements: [CUDA installation guide for Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) or [CUDA installation guide for Mac Os X](https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html) + In addition, if you want to contribute code to Ginkgo, you will also need the following: -* _clang-format 5.0.1+_ (ships as part of _clang_) +* _clang-format 5.0.0+_ (ships as part of _clang_) * _clang-tidy_ (optional, when setting the flag `-DGINKGO_WITH_CLANG_TIDY=ON`) * _iwyu_ (Include What You Use, optional, when setting the flag `-DGINKGO_WITH_IWYU=ON`) +The Ginkgo HIP module has the following __additional__ requirements: + +* _ROCm 2.8+_ +* the HIP, hipBLAS and hipSPARSE packages compiled with either: + * _AMD_ backend + * _CUDA 9.0+_ backend. When using CUDA 10+, _cmake 3.12.2+_ is required. + ### Windows The prequirement needs to be verified * _cmake 3.9+_ * C++11 compliant 64-bits compiler: - * _MinGW : gcc 5.3+, 6.3+, 7.3+, 8.1+_ - * _Cygwin : gcc 5.3+, 6.3+, 7.3+, 8.1+_ + * _MinGW : gcc 5.3+, 6.3+, 7.3+, all versions after 8.1+_ + * _Cygwin : gcc 5.3+, 6.3+, 7.3+, all versions after 8.1+_ * _Microsoft Visual Studio : VS 2017 15.7+_ __NOTE:__ Need to add `--autocrlf=input` after `git clone` in _Cygwin_. 
@@ -128,12 +142,10 @@ Name Surname Institution(s) #### Contributing guidelines -Contributing guidelines can be accessed in our Wiki under the [Developer's -Homepage](https://github.com/ginkgo-project/ginkgo/wiki/Developers-Homepage). -This page also contains other information useful to developers, such as writing -proper commit messages, understanding Ginkgo's library design, relevant C++ -information, and more. In general, always refer to this page for developer -information. +Contributing guidelines can be accessed in the [CONTRIBUTING.md +page](./CONTRIBUTING.md). This page also contains other information useful to +developers, such as writing proper commit messages, understanding Ginkgo's +library design, relevant C++ information, and more. ### Support If you have any question, bug to report or would like to propose a new feature, @@ -152,3 +164,23 @@ Depending on the configuration options used when building Ginkgo, third party software may be pulled as additional dependencies, which have their own licensing conditions. Refer to [ABOUT-LICENSING.md](ABOUT-LICENSING.md) for details. + +Citing Ginkgo +------------- + +The main Ginkgo paper describing Ginkgo's purpose, design and interface is +available through the following reference: + +``` bibtex +@misc{anzt2020ginkgo, + title={Ginkgo: A Modern Linear Operator Algebra Framework for High Performance Computing}, + author={Hartwig Anzt and Terry Cojean and Goran Flegar and Fritz Göbel and Thomas Grützmacher and Pratik Nayak and Tobias Ribizel and Yuhsiang Mike Tsai and Enrique S. Quintana-Ortí}, + year={2020}, + eprint={2006.16852}, + archivePrefix={arXiv}, + primaryClass={cs.MS} +} +``` + +For more information on topical subjects, please refer to the [CITING.md +page](CITING.md). 
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index cbfe9975edc..e786c63d5ed 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -4,6 +4,13 @@ if (NOT CMAKE_BUILD_TYPE STREQUAL "Release") "will be affected") endif() +if (GINKGO_BUILD_CUDA AND GINKGO_BUILD_HIP AND GINKGO_HIP_PLATFORM MATCHES "hcc") + message(FATAL_ERROR "Building the benchmarks for both HIP AMD and CUDA " + "at the same time is currently not supported. " + "Disable the benchmark build using `-DGINKGO_BUILD_BENCHMARKS=OFF` " + "or use `export HIP_PLATFORM=nvcc` in your build environment instead.") +endif() + function(ginkgo_benchmark_cusp_linops name) target_compile_definitions("${name}" PRIVATE HAS_CUDA=1) target_link_libraries("${name}" ginkgo ${CUDA_RUNTIME_LIBS} @@ -14,6 +21,35 @@ function(ginkgo_benchmark_cusp_linops name) endif() endfunction() +function(ginkgo_benchmark_hipsp_linops name) + target_compile_definitions("${name}" PRIVATE HAS_HIP=1) + EXECUTE_PROCESS(COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config OUTPUT_VARIABLE HIP_CXX_FLAGS) + set_target_properties("${name}" PROPERTIES COMPILE_FLAGS ${HIP_CXX_FLAGS}) + # for some reason, HIP creates a dependency on Threads::Threads here, so we + # need to find it + find_package(Threads REQUIRED) + find_package(HIP REQUIRED) + find_package(hipblas REQUIRED) + find_package(hipsparse REQUIRED) + target_include_directories("${name}" SYSTEM PRIVATE + ${HSA_HEADER} ${HIP_INCLUDE_DIRS} + ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS}) + + if(GINKGO_HIP_PLATFORM MATCHES "hcc") + ginkgo_hip_ban_link_hcflag(hcc::hccrt) + ginkgo_hip_ban_link_hcflag(hcc::hc_am) + ginkgo_hip_ban_link_hcflag(hcc::mcwamp) + ginkgo_hip_ban_compile_hcflag(hcc::hccrt) + ginkgo_hip_ban_compile_hcflag(hcc::hc_am) + ginkgo_hip_ban_compile_hcflag(hcc::mcwamp) + ginkgo_hip_clang_ban_hip_device_flags() + target_link_libraries("${name}" hip::device) + else() + target_link_libraries("${name}" ${HIP_CUDA_LIBRARIES}) + endif() + 
target_link_libraries("${name}" ${HIPSPARSE_LIBRARIES}) +endfunction() + add_subdirectory(conversions) add_subdirectory(matrix_generator) add_subdirectory(matrix_statistics) diff --git a/benchmark/conversions/conversions.cpp b/benchmark/conversions/conversions.cpp index 7921c31fa81..d2cc6c147e9 100644 --- a/benchmark/conversions/conversions.cpp +++ b/benchmark/conversions/conversions.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -195,5 +195,5 @@ int main(int argc, char *argv[]) } } - std::cout << test_cases; + std::cout << test_cases << std::endl; } diff --git a/benchmark/matrix_generator/matrix_generator.cpp b/benchmark/matrix_generator/matrix_generator.cpp index 9f127b11cff..7622d2cd4ed 100644 --- a/benchmark/matrix_generator/matrix_generator.cpp +++ b/benchmark/matrix_generator/matrix_generator.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -72,6 +72,7 @@ std::string input_format = } // namespace +// clang-format off // input validation [[noreturn]] void print_config_error_and_exit(int code = 1) { @@ -79,6 +80,7 @@ std::string input_format = << input_format << std::endl; std::exit(code); } +// clang-format on void validate_option_object(const rapidjson::Value &value) @@ -151,5 +153,5 @@ int main(int argc, char *argv[]) } } - std::cout << configurations; + std::cout << configurations << std::endl; } diff --git a/benchmark/matrix_statistics/matrix_statistics.cpp b/benchmark/matrix_statistics/matrix_statistics.cpp index c0f6a86f70c..72e899407d7 100644 --- a/benchmark/matrix_statistics/matrix_statistics.cpp +++ b/benchmark/matrix_statistics/matrix_statistics.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -72,18 +72,21 @@ void compute_summary(const std::vector &dist, // clang-format on add_or_set_member(out, "min", dist[0], allocator); - add_or_set_member(out, "q1", - coefs[r][0] * dist[positions[r][0]] + - coefs[r][1] * dist[positions[r][1]], - allocator); - add_or_set_member(out, "median", - coefs[r][2] * dist[positions[r][2]] + - coefs[r][3] * dist[positions[r][3]], - allocator); - add_or_set_member(out, "q3", - coefs[r][4] * dist[positions[r][4]] + - coefs[r][5] * dist[positions[r][5]], - allocator); + add_or_set_member( + out, "q1", + coefs[r][0] * static_cast(dist[positions[r][0]]) + + coefs[r][1] * static_cast(dist[positions[r][1]]), + allocator); + add_or_set_member( + out, "median", + coefs[r][2] * static_cast(dist[positions[r][2]]) + + coefs[r][3] * static_cast(dist[positions[r][3]]), + allocator); + add_or_set_member( + out, "q3", + coefs[r][4] * static_cast(dist[positions[r][4]]) + + coefs[r][5] * 
static_cast(dist[positions[r][5]]), + allocator); add_or_set_member(out, "max", dist[dist.size() - 1], allocator); } @@ -94,11 +97,12 @@ double compute_moment(int degree, const std::vector &dist, if (normalization == 0.0) { return 0.0; } - auto moment = 0.0; + double moment = 0.0; for (const auto &x : dist) { - moment += std::pow(x - center, degree); + moment += std::pow(static_cast(x) - center, degree); } - return moment / dist.size() / std::pow(normalization, degree); + return moment / static_cast(dist.size()) / + std::pow(normalization, static_cast(degree)); } @@ -208,5 +212,5 @@ int main(int argc, char *argv[]) } } - std::cout << test_cases; + std::cout << test_cases << std::endl; } diff --git a/benchmark/preconditioner/preconditioner.cpp b/benchmark/preconditioner/preconditioner.cpp index 0fc19054d85..11979fd6ba1 100644 --- a/benchmark/preconditioner/preconditioner.cpp +++ b/benchmark/preconditioner/preconditioner.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -52,9 +52,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. DEFINE_uint32(max_block_size, 32, "Maximal block size of the block-Jacobi preconditioner"); -DEFINE_string(preconditioners, "jacobi", - "A comma-separated list of solvers to run." - "Supported values are: jacobi"); +DEFINE_uint32(num_iterations, 5, + "Number of iterations for the ParICT/ParILU(T) preconditioner"); + +DEFINE_bool( + approx_select, true, + "Use approximate selection for the threshold filtering in ParICT/ParILUT"); + +DEFINE_double(fill_limit, 2.0, "The fill-in limit used in ParICT/ParILUT"); + +DEFINE_string(preconditioners, "jacobi,parilu,parilut,ilu", + "A comma-separated list of preconditioners to run." 
+ "Supported values are: jacobi, parict, parilu, parilut, ilu"); DEFINE_string(storage_optimization, "0,0", "Defines the kind of storage optimization to perform on " @@ -90,13 +99,55 @@ gko::precision_reduction parse_storage_optimization(const std::string &flag) const std::map( std::shared_ptr exec)>> precond_factory{ - {"jacobi", [](std::shared_ptr exec) { + {"jacobi", + [](std::shared_ptr exec) { return gko::preconditioner::Jacobi::build() .with_max_block_size(FLAGS_max_block_size) .with_storage_optimization( parse_storage_optimization(FLAGS_storage_optimization)) .with_accuracy(FLAGS_accuracy) .on(exec); + }}, + {"parict", + [](std::shared_ptr exec) { + auto ict_fact = std::shared_ptr( + gko::factorization::ParIct::build() + .with_iterations(FLAGS_num_iterations) + .with_approximate_select(FLAGS_approx_select) + .with_fill_in_limit(FLAGS_fill_limit) + .on(exec)); + return gko::preconditioner::Ilu<>::build() + .with_factorization_factory(ict_fact) + .on(exec); + }}, + {"parilu", + [](std::shared_ptr exec) { + auto ilu_fact = std::shared_ptr( + gko::factorization::ParIlu::build() + .with_iterations(FLAGS_num_iterations) + .on(exec)); + return gko::preconditioner::Ilu<>::build() + .with_factorization_factory(ilu_fact) + .on(exec); + }}, + {"parilut", + [](std::shared_ptr exec) { + auto ilut_fact = std::shared_ptr( + gko::factorization::ParIlut::build() + .with_iterations(FLAGS_num_iterations) + .with_approximate_select(FLAGS_approx_select) + .with_fill_in_limit(FLAGS_fill_limit) + .on(exec)); + return gko::preconditioner::Ilu<>::build() + .with_factorization_factory(ilut_fact) + .on(exec); + }}, + {"ilu", [](std::shared_ptr exec) { + auto ilu_fact = std::shared_ptr( + gko::factorization::Ilu::build().on(exec)); + return gko::preconditioner::Ilu<>::build() + .with_factorization_factory(ilu_fact) + .on(exec); }}}; @@ -105,12 +156,34 @@ const std::map( std::string encode_parameters(const char *precond_name) { static std::map encoder{ - {"jacobi", [] { + {"jacobi", + 
[] { std::ostringstream oss; oss << "jacobi-" << FLAGS_max_block_size << "-" << FLAGS_storage_optimization; return oss.str(); - }}}; + }}, + {"parict", + [] { + std::ostringstream oss; + oss << "parict-" << FLAGS_num_iterations << '-' + << FLAGS_approx_select << '-' << FLAGS_fill_limit; + return oss.str(); + }}, + {"parilu", + [] { + std::ostringstream oss; + oss << "parilu-" << FLAGS_num_iterations; + return oss.str(); + }}, + {"parilut", + [] { + std::ostringstream oss; + oss << "parilut-" << FLAGS_num_iterations << '-' + << FLAGS_approx_select << '-' << FLAGS_fill_limit; + return oss.str(); + }}, + {"ilu", [] { return std::string{"ilu"}; }}}; return encoder[precond_name](); } @@ -196,7 +269,8 @@ void run_preconditioner(const char *precond_name, auto x_clone = clone(x); auto precond = precond_factory.at(precond_name)(exec); - auto gen_logger = std::make_shared(exec); + auto gen_logger = + std::make_shared(exec, FLAGS_nested_names); exec->add_logger(gen_logger); std::unique_ptr precond_op; for (auto i = 0u; i < FLAGS_repetitions; ++i) { @@ -207,7 +281,8 @@ void run_preconditioner(const char *precond_name, gen_logger->write_data(this_precond_data["generate"]["components"], allocator, FLAGS_repetitions); - auto apply_logger = std::make_shared(exec); + auto apply_logger = + std::make_shared(exec, FLAGS_nested_names); exec->add_logger(apply_logger); for (auto i = 0u; i < FLAGS_repetitions; ++i) { precond_op->apply(lend(b), lend(x_clone)); @@ -310,5 +385,5 @@ int main(int argc, char *argv[]) } } - std::cout << test_cases; + std::cout << test_cases << std::endl; } diff --git a/benchmark/run_all_benchmarks.sh b/benchmark/run_all_benchmarks.sh index 64e2b5944b2..6a782bac322 100644 --- a/benchmark/run_all_benchmarks.sh +++ b/benchmark/run_all_benchmarks.sh @@ -30,6 +30,26 @@ if [ ! "${PRECONDS}" ]; then PRECONDS="none" fi +if [ ! 
"${FORMATS}" ]; then + echo "FORMATS environment variable not set - assuming \"csr,coo,ell,hybrid,sellp\"" 1>&2 + FORMATS="csr,coo,ell,hybrid,sellp" +fi + +if [ ! "${SOLVERS}" ]; then + echo "SOLVERS environment variable not set - assuming \"bicgstab,cg,cgs,fcg,gmres\"" 1>&2 + SOLVERS="bicgstab,cg,cgs,fcg,gmres" +fi + +if [ ! "${SOLVERS_PRECISION}" ]; then + echo "SOLVERS_PRECISION environment variable not set - assuming \"1e-6\"" 1>&2 + SOLVERS_PRECISION=1e-6 +fi + +if [ ! "${SOLVERS_MAX_ITERATIONS}" ]; then + echo "SOLVERS_MAX_ITERATIONS environment variable not set - assuming \"10000\"" 1>&2 + SOLVERS_MAX_ITERATIONS=10000 +fi + if [ ! "${SYSTEM_NAME}" ]; then echo "SYSTEM_MANE environment variable not set - assuming \"unknown\"" 1>&2 SYSTEM_NAME="unknown" @@ -40,6 +60,31 @@ if [ ! "${DEVICE_ID}" ]; then DEVICE_ID="0" fi +# Control whether to run detailed benchmarks or not. +# Default setting is detailed=false. To activate, set DETAILED=1. +if [ ! "${DETAILED}" ] || [ "${DETAILED}" -eq 0 ]; then + DETAILED_STR="--detailed=false" +else + DETAILED_STR="--detailed=true" +fi + +# This allows using a matrix list file for benchmarking. +# The file should contains a suitesparse matrix on each line. +# The allowed formats to target suitesparse matrix is: +# id or group/name or name. +# Example: +# 1903 +# Freescale/circuit5M +# thermal2 +if [ ! "${MATRIX_LIST_FILE}" ]; then + use_matrix_list_file=0 +elif [ -f "${MATRIX_LIST_FILE}" ]; then + use_matrix_list_file=1 +else + echo -e "A matrix list file was set to ${MATRIX_LIST_FILE} but it cannot be found." 
+ exit 1 +fi + ################################################################################ # Utilities @@ -87,7 +132,7 @@ run_conversion_benchmarks() { [ "${DRY_RUN}" == "true" ] && return cp "$1" "$1.imd" # make sure we're not loosing the original input ./conversions/conversions --backup="$1.bkp" --double_buffer="$1.bkp2" \ - --executor="${EXECUTOR}" --formats="csr,coo,hybrid,sellp,ell" \ + --executor="${EXECUTOR}" --formats="${FORMATS}" \ --device_id="${DEVICE_ID}" \ <"$1.imd" 2>&1 >"$1" keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd" @@ -103,7 +148,7 @@ run_spmv_benchmarks() { [ "${DRY_RUN}" == "true" ] && return cp "$1" "$1.imd" # make sure we're not loosing the original input ./spmv/spmv --backup="$1.bkp" --double_buffer="$1.bkp2" \ - --executor="${EXECUTOR}" --formats="csr,coo,hybrid,sellp,ell" \ + --executor="${EXECUTOR}" --formats="${FORMATS}" \ --device_id="${DEVICE_ID}" \ <"$1.imd" 2>&1 >"$1" keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd" @@ -119,10 +164,10 @@ run_solver_benchmarks() { [ "${DRY_RUN}" == "true" ] && return cp "$1" "$1.imd" # make sure we're not loosing the original input ./solver/solver --backup="$1.bkp" --double_buffer="$1.bkp2" \ - --executor="${EXECUTOR}" --solvers="cg,bicgstab,cgs,fcg" \ + --executor="${EXECUTOR}" --solvers="${SOLVERS}" \ --preconditioners="${PRECONDS}" \ - --max_iters=10000 --rel_res_goal=1e-6 \ - --device_id="${DEVICE_ID}" \ + --max_iters=${SOLVERS_MAX_ITERATIONS} --rel_res_goal=${SOLVERS_PRECISION} \ + ${DETAILED_STR} --device_id="${DEVICE_ID}" \ <"$1.imd" 2>&1 >"$1" keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd" } @@ -173,9 +218,42 @@ generate_suite_sparse_input() { EOT } +parse_matrix_list() { + local source_list_file=$1 + local benchmark_list="" + local id=0 + for mtx in $(cat ${source_list_file}); do + if [[ ! 
"$mtx" =~ ^[0-9]+$ ]]; then + if [[ "$mtx" =~ ^[a-zA-Z0-9_-]+$ ]]; then + id=$(${SSGET} -s "[ @name == $mtx ]") + elif [[ "$mtx" =~ ^([a-zA-Z0-9_-]+)\/([a-zA-Z0-9_-]+)$ ]]; then + local group="${BASH_REMATCH[1]}" + local name="${BASH_REMATCH[2]}" + id=$(${SSGET} -s "[ @name == $name ] && [ @group == $group ]") + else + >&2 echo -e "Could not recognize entry $mtx." + fi + else + id=$mtx + fi + benchmark_list="$benchmark_list $id" + done + echo "$benchmark_list" +} + +if [ $use_matrix_list_file -eq 1 ]; then + MATRIX_LIST=($(parse_matrix_list $MATRIX_LIST_FILE)) + NUM_PROBLEMS=${#MATRIX_LIST[@]} +fi + LOOP_START=$((1 + (${NUM_PROBLEMS}) * (${SEGMENT_ID} - 1) / ${SEGMENTS})) LOOP_END=$((1 + (${NUM_PROBLEMS}) * (${SEGMENT_ID}) / ${SEGMENTS})) -for (( i=${LOOP_START}; i < ${LOOP_END}; ++i )); do +for (( p=${LOOP_START}; p < ${LOOP_END}; ++p )); do + if [ $use_matrix_list_file -eq 1 ]; then + i=${MATRIX_LIST[$((p-1))]} + else + i=$p + fi if [ "${BENCHMARK}" == "preconditioner" ]; then break fi diff --git a/benchmark/solver/CMakeLists.txt b/benchmark/solver/CMakeLists.txt index fc1d203ca05..1faae042b24 100644 --- a/benchmark/solver/CMakeLists.txt +++ b/benchmark/solver/CMakeLists.txt @@ -2,4 +2,7 @@ add_executable(solver solver.cpp) target_link_libraries(solver ginkgo gflags rapidjson) if (GINKGO_BUILD_CUDA) ginkgo_benchmark_cusp_linops(solver) +endif() +if (GINKGO_BUILD_HIP) + ginkgo_benchmark_hipsp_linops(solver) endif() \ No newline at end of file diff --git a/benchmark/solver/solver.cpp b/benchmark/solver/solver.cpp index 7885c30511c..f043977ab9a 100644 --- a/benchmark/solver/solver.cpp +++ b/benchmark/solver/solver.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include #include #include #include @@ -45,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/utils/formats.hpp" #include "benchmark/utils/general.hpp" #include "benchmark/utils/loggers.hpp" +#include "benchmark/utils/overhead_linop.hpp" // some Ginkgo shortcuts @@ -57,21 +59,40 @@ DEFINE_uint32(max_iters, 1000, DEFINE_double(rel_res_goal, 1e-6, "The relative residual goal of the solver"); -DEFINE_string(solvers, "cg", - "A comma-separated list of solvers to run." - "Supported values are: bicgstab, cg, cgs, fcg, gmres"); +DEFINE_string( + solvers, "cg", + "A comma-separated list of solvers to run. " + "Supported values are: bicgstab, bicg, cg, cgs, fcg, gmres, overhead"); -DEFINE_string(preconditioners, "none", - "A comma-separated list of preconditioners to use." - "Supported values are: none, jacobi, adaptive-jacobi"); +DEFINE_string( + preconditioners, "none", + "A comma-separated list of preconditioners to use. " + "Supported values are: none, jacobi, adaptive-jacobi, parict, parilu, " + "parilut, ilu, overhead"); + +DEFINE_uint32(parilu_iterations, 5, + "The number of iterations for ParICT/ParILU(T)"); + +DEFINE_bool(parilut_approx_select, true, + "Use approximate selection for ParICT/ParILUT"); + +DEFINE_double(parilut_limit, 2.0, "The fill-in limit for ParICT/ParILUT"); DEFINE_uint32( nrhs, 1, "The number of right hand sides. Record the residual only when nrhs == 1."); +// This allows to benchmark the overhead of a solver by using the following +// data: A=[1.0], x=[0.0], b=[nan]. This data can be used to benchmark normal +// solvers or using the argument --solvers=overhead, a minimal solver will be +// launched which contains only a few kernel calls. 
+DEFINE_bool(overhead, false, + "If set, uses dummy data to benchmark Ginkgo overhead"); + // input validation -[[noreturn]] void print_config_error_and_exit() { +[[noreturn]] void print_config_error_and_exit() +{ std::cerr << "Input has to be a JSON array of matrix configurations:\n" << " [\n" << " { \"filename\": \"my_file.mtx\", \"optimal\": { " @@ -116,10 +137,12 @@ const std::map( std::shared_ptr, std::shared_ptr)>> solver_factory{{"bicgstab", create_solver>}, + {"bicg", create_solver>}, {"cg", create_solver>}, {"cgs", create_solver>}, {"fcg", create_solver>}, - {"gmres", create_solver>}}; + {"gmres", create_solver>}, + {"overhead", create_solver>}}; // TODO: Workaround until GPU matrix conversions are implemented @@ -165,7 +188,8 @@ const std::map( return std::unique_ptr( new ReferenceFactoryWrapper(f)); }}, - {"adaptive-jacobi", [](std::shared_ptr exec) { + {"adaptive-jacobi", + [](std::shared_ptr exec) { std::shared_ptr f = gko::preconditioner::Jacobi<>::build() .with_storage_optimization( @@ -173,6 +197,66 @@ const std::map( .on(exec); return std::unique_ptr( new ReferenceFactoryWrapper(f)); + }}, + {"parict", + [](std::shared_ptr exec) { + auto fact = std::shared_ptr( + gko::factorization::ParIct<>::build() + .with_iterations(FLAGS_parilu_iterations) + .with_approximate_select(FLAGS_parilut_approx_select) + .with_fill_in_limit(FLAGS_parilut_limit) + .on(exec)); + std::shared_ptr f = + gko::preconditioner::Ilu<>::build() + .with_factorization_factory(fact) + .on(exec); + return std::unique_ptr( + new ReferenceFactoryWrapper(f)); + }}, + {"parilu", + [](std::shared_ptr exec) { + auto fact = std::shared_ptr( + gko::factorization::ParIlu<>::build() + .with_iterations(FLAGS_parilu_iterations) + .on(exec)); + std::shared_ptr f = + gko::preconditioner::Ilu<>::build() + .with_factorization_factory(fact) + .on(exec); + return std::unique_ptr( + new ReferenceFactoryWrapper(f)); + }}, + {"parilut", + [](std::shared_ptr exec) { + auto fact = std::shared_ptr( + 
gko::factorization::ParIlut<>::build() + .with_iterations(FLAGS_parilu_iterations) + .with_approximate_select(FLAGS_parilut_approx_select) + .with_fill_in_limit(FLAGS_parilut_limit) + .on(exec)); + std::shared_ptr f = + gko::preconditioner::Ilu<>::build() + .with_factorization_factory(fact) + .on(exec); + return std::unique_ptr( + new ReferenceFactoryWrapper(f)); + }}, + {"ilu", + [](std::shared_ptr exec) { + auto fact = std::shared_ptr( + gko::factorization::Ilu<>::build().on(exec)); + std::shared_ptr f = + gko::preconditioner::Ilu<>::build() + .with_factorization_factory(fact) + .on(exec); + return std::unique_ptr( + new ReferenceFactoryWrapper(f)); + }}, + {"overhead", [](std::shared_ptr exec) { + std::shared_ptr f = + gko::Overhead<>::build().on(exec); + return std::unique_ptr( + new ReferenceFactoryWrapper(f)); }}}; @@ -244,8 +328,10 @@ void solve_system(const std::string &solver_name, rapidjson::Value(rapidjson::kArrayType), allocator); add_or_set_member(solver_json, "true_residuals", rapidjson::Value(rapidjson::kArrayType), allocator); - if (FLAGS_nrhs == 1) { - auto rhs_norm = compute_norm(lend(b)); + add_or_set_member(solver_json, "iteration_timestamps", + rapidjson::Value(rapidjson::kArrayType), allocator); + if (FLAGS_nrhs == 1 && !FLAGS_overhead) { + auto rhs_norm = compute_norm2(lend(b)); add_or_set_member(solver_json, "rhs_norm", rhs_norm, allocator); } for (auto stage : {"generate", "apply"}) { @@ -258,21 +344,28 @@ void solve_system(const std::string &solver_name, } // warm run + auto it_logger = std::make_shared(exec); for (unsigned int i = 0; i < FLAGS_warmup; i++) { auto x_clone = clone(x); auto precond = precond_factory.at(precond_name)(exec); auto solver = solver_factory.at(solver_name)(exec, give(precond)) ->generate(system_matrix); + solver->add_logger(it_logger); solver->apply(lend(b), lend(x_clone)); exec->synchronize(); + solver->remove_logger(gko::lend(it_logger)); + } + if (FLAGS_warmup > 0) { + 
it_logger->write_data(solver_json["apply"], allocator); } // detail run - if (FLAGS_detailed) { + if (FLAGS_detailed && !FLAGS_overhead) { // slow run, get the time of each functions auto x_clone = clone(x); - auto gen_logger = std::make_shared(exec); + auto gen_logger = + std::make_shared(exec, FLAGS_nested_names); exec->add_logger(gen_logger); auto precond = precond_factory.at(precond_name)(exec); @@ -293,7 +386,8 @@ void solve_system(const std::string &solver_name, solver_json["preconditioner"], allocator); } - auto apply_logger = std::make_shared(exec); + auto apply_logger = + std::make_shared(exec, FLAGS_nested_names); exec->add_logger(apply_logger); solver->apply(lend(b), lend(x_clone)); @@ -308,7 +402,8 @@ void solve_system(const std::string &solver_name, auto res_logger = std::make_shared>( exec, lend(system_matrix), b, solver_json["recurrent_residuals"], - solver_json["true_residuals"], allocator); + solver_json["true_residuals"], + solver_json["iteration_timestamps"], allocator); solver->add_logger(res_logger); solver->apply(lend(b), lend(x_clone)); } @@ -344,7 +439,8 @@ void solve_system(const std::string &solver_name, apply_time += std::chrono::duration_cast( a_tac - a_tic); - if (FLAGS_nrhs == 1 && i == FLAGS_repetitions - 1) { + if (FLAGS_nrhs == 1 && i == FLAGS_repetitions - 1 && + !FLAGS_overhead) { auto residual = compute_residual_norm(lend(system_matrix), lend(b), lend(x_clone)); add_or_set_member(solver_json, "residual_norm", residual, @@ -406,9 +502,18 @@ int main(int argc, char *argv[]) } } - rapidjson::IStreamWrapper jcin(std::cin); rapidjson::Document test_cases; - test_cases.ParseStream(jcin); + if (!FLAGS_overhead) { + rapidjson::IStreamWrapper jcin(std::cin); + test_cases.ParseStream(jcin); + } else { + // Fake test case to run once + auto overhead_json = std::string() + + " [{\"filename\": \"overhead.mtx\", \"optimal\": " + "{ \"spmv\": \"csr\"}}]"; + test_cases.Parse(overhead_json.c_str()); + } + if (!test_cases.IsArray()) { 
print_config_error_and_exit(); } @@ -435,15 +540,26 @@ int main(int argc, char *argv[]) } std::clog << "Running test case: " << test_case << std::endl; std::ifstream mtx_fd(test_case["filename"].GetString()); - auto data = gko::read_raw(mtx_fd); - auto system_matrix = share(formats::matrix_factory.at( - test_case["optimal"]["spmv"].GetString())(exec, data)); - auto b = create_matrix( - exec, gko::dim<2>{system_matrix->get_size()[0], FLAGS_nrhs}, - engine); - auto x = create_matrix( - exec, gko::dim<2>{system_matrix->get_size()[0], FLAGS_nrhs}); + using Vec = gko::matrix::Dense<>; + std::shared_ptr system_matrix; + std::unique_ptr b; + std::unique_ptr x; + if (FLAGS_overhead) { + system_matrix = gko::initialize({1.0}, exec); + b = gko::initialize({std::nan("")}, exec); + x = gko::initialize({0.0}, exec); + } else { + auto data = gko::read_raw(mtx_fd); + system_matrix = share(formats::matrix_factory.at( + test_case["optimal"]["spmv"].GetString())(exec, data)); + b = create_matrix( + exec, gko::dim<2>{system_matrix->get_size()[0], FLAGS_nrhs}, + engine); + x = create_matrix( + exec, + gko::dim<2>{system_matrix->get_size()[0], FLAGS_nrhs}); + } std::clog << "Matrix is of size (" << system_matrix->get_size()[0] << ", " << system_matrix->get_size()[1] << ")" @@ -467,5 +583,5 @@ int main(int argc, char *argv[]) } } - std::cout << test_cases; + std::cout << test_cases << std::endl; } diff --git a/benchmark/spmv/CMakeLists.txt b/benchmark/spmv/CMakeLists.txt index 13e637097cf..222d3f750b4 100644 --- a/benchmark/spmv/CMakeLists.txt +++ b/benchmark/spmv/CMakeLists.txt @@ -3,3 +3,6 @@ target_link_libraries(spmv ginkgo gflags rapidjson) if (GINKGO_BUILD_CUDA) ginkgo_benchmark_cusp_linops(spmv) endif() +if (GINKGO_BUILD_HIP) + ginkgo_benchmark_hipsp_linops(spmv) +endif() diff --git a/benchmark/spmv/spmv.cpp b/benchmark/spmv/spmv.cpp index 69a3a9e90e0..07debcf9426 100644 --- a/benchmark/spmv/spmv.cpp +++ b/benchmark/spmv/spmv.cpp @@ -1,5 +1,5 @@ 
/************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -61,7 +61,8 @@ DEFINE_uint32(nrhs, 1, "The number of right hand sides"); // calling it void apply_spmv(const char *format_name, std::shared_ptr exec, const gko::matrix_data &data, const vec *b, - const vec *x, rapidjson::Value &test_case, + const vec *x, const vec *answer, + rapidjson::Value &test_case, rapidjson::MemoryPoolAllocator<> &allocator) { try { @@ -73,8 +74,20 @@ void apply_spmv(const char *format_name, std::shared_ptr exec, exec->add_logger(storage_logger); auto system_matrix = share(formats::matrix_factory.at(format_name)(exec, data)); + exec->remove_logger(gko::lend(storage_logger)); storage_logger->write_data(spmv_case[format_name], allocator); + // check the residual + if (FLAGS_detailed) { + auto x_clone = clone(x); + exec->synchronize(); + system_matrix->apply(lend(b), lend(x_clone)); + exec->synchronize(); + double max_relative_norm2 = + compute_max_relative_norm2(lend(x_clone), lend(answer)); + add_or_set_member(spmv_case[format_name], "max_relative_norm2", + max_relative_norm2, allocator); + } // warm run for (unsigned int i = 0; i < FLAGS_warmup; i++) { auto x_clone = clone(x); @@ -172,9 +185,20 @@ int main(int argc, char *argv[]) rapidjson::Value(rapidjson::kObjectType), allocator); } + + // Compute the result from ginkgo::coo as the correct answer + auto answer = vec::create(exec); + if (FLAGS_detailed) { + auto system_matrix = + share(formats::matrix_factory.at("coo")(exec, data)); + answer->copy_from(lend(x)); + exec->synchronize(); + system_matrix->apply(lend(b), lend(answer)); + exec->synchronize(); + } for (const auto &format_name : formats) { apply_spmv(format_name.c_str(), exec, data, lend(b), lend(x), - test_case, allocator); + lend(answer), test_case, allocator); std::clog << "Current state:" 
<< std::endl << test_cases << std::endl; if (spmv_case[format_name.c_str()]["completed"].GetBool()) { @@ -199,5 +223,5 @@ int main(int argc, char *argv[]) } } - std::cout << test_cases; + std::cout << test_cases << std::endl; } diff --git a/benchmark/utils/cuda_linops.hpp b/benchmark/utils/cuda_linops.hpp index 105e0a3f4d5..7762a2439d2 100644 --- a/benchmark/utils/cuda_linops.hpp +++ b/benchmark/utils/cuda_linops.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -37,14 +37,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include + + +#include #include #include -#include #include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/device_guard.hpp" #include "cuda/base/pointer_mode_guard.hpp" +#include "cuda/base/types.hpp" namespace detail { @@ -54,7 +58,12 @@ class CuspBase : public gko::LinOp { public: cusparseMatDescr_t get_descr() const { return this->descr_.get(); } - const gko::CudaExecutor *get_gpu_exec() const { return gpu_exec_.get(); } + // Return shared pointer not plain pointer such that CuspGenericSpMV uses + // gko::Array to allocate buffer. 
+ std::shared_ptr get_gpu_exec() const + { + return gpu_exec_; + } protected: void apply_impl(const gko::LinOp *, const gko::LinOp *, const gko::LinOp *, @@ -91,11 +100,11 @@ class CuspBase : public gko::LinOp { void initialize_descr() { const auto id = this->gpu_exec_->get_device_id(); - gko::device_guard g{id}; + gko::cuda::device_guard g{id}; this->descr_ = handle_manager( gko::kernels::cuda::cusparse::create_mat_descr(), [id](cusparseMatDescr_t descr) { - gko::device_guard g{id}; + gko::cuda::device_guard g{id}; gko::kernels::cuda::cusparse::destroy(descr); }); } @@ -141,7 +150,7 @@ class CuspCsrmp auto dx = dense_x->get_values(); const auto id = this->get_gpu_exec()->get_device_id(); - gko::device_guard g{id}; + gko::cuda::device_guard g{id}; gko::kernels::cuda::cusparse::spmv_mp( this->get_gpu_exec()->get_cusparse_handle(), trans_, this->get_size()[0], this->get_size()[1], @@ -201,7 +210,7 @@ class CuspCsr auto dx = dense_x->get_values(); const auto id = this->get_gpu_exec()->get_device_id(); - gko::device_guard g{id}; + gko::cuda::device_guard g{id}; gko::kernels::cuda::cusparse::spmv( this->get_gpu_exec()->get_cusparse_handle(), trans_, this->get_size()[0], this->get_size()[1], @@ -261,7 +270,7 @@ class CuspCsrmm auto dx = dense_x->get_values(); const auto id = this->get_gpu_exec()->get_device_id(); - gko::device_guard g{id}; + gko::cuda::device_guard g{id}; gko::kernels::cuda::cusparse::spmm( this->get_gpu_exec()->get_cusparse_handle(), trans_, this->get_size()[0], dense_b->get_size()[1], this->get_size()[1], @@ -318,7 +327,7 @@ class CuspCsrEx const auto id = this->get_gpu_exec()->get_device_id(); if (set_buffer_) { try { - gko::device_guard g{id}; + gko::cuda::device_guard g{id}; GKO_ASSERT_NO_CUDA_ERRORS(cudaFree(buffer_)); } catch (const std::exception &e) { std::cerr @@ -344,7 +353,7 @@ class CuspCsrEx gko::size_type buffer_size = 0; const auto id = this->get_gpu_exec()->get_device_id(); - gko::device_guard g{id}; + gko::cuda::device_guard g{id}; auto 
handle = this->get_gpu_exec()->get_cusparse_handle(); // This function seems to require the pointer mode to be set to HOST. // Ginkgo use pointer mode DEVICE by default, so we change this @@ -416,7 +425,7 @@ class CuspHybrid this->set_size(gko::dim<2>{t_csr->get_size()}); const auto id = this->get_gpu_exec()->get_device_id(); - gko::device_guard g{id}; + gko::cuda::device_guard g{id}; gko::kernels::cuda::cusparse::csr2hyb( this->get_gpu_exec()->get_cusparse_handle(), this->get_size()[0], this->get_size()[1], this->get_descr(), t_csr->get_const_values(), @@ -428,7 +437,7 @@ class CuspHybrid { const auto id = this->get_gpu_exec()->get_device_id(); try { - gko::device_guard g{id}; + gko::cuda::device_guard g{id}; GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyHybMat(hyb_)); } catch (const std::exception &e) { std::cerr << "Error when unallocating CuspHybrid hyb_ matrix: " @@ -449,7 +458,7 @@ class CuspHybrid auto dx = dense_x->get_values(); const auto id = this->get_gpu_exec()->get_device_id(); - gko::device_guard g{id}; + gko::cuda::device_guard g{id}; gko::kernels::cuda::cusparse::spmv( this->get_gpu_exec()->get_cusparse_handle(), trans_, &scalars.get_const_data()[0], this->get_descr(), hyb_, db, @@ -462,7 +471,7 @@ class CuspHybrid trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) { const auto id = this->get_gpu_exec()->get_device_id(); - gko::device_guard g{id}; + gko::cuda::device_guard g{id}; GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateHybMat(&hyb_)); } @@ -475,6 +484,206 @@ class CuspHybrid }; +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && \ + !(defined(_WIN32) || defined(__CYGWIN__)) + + +template +void cusp_generic_spmv(std::shared_ptr gpu_exec, + const cusparseSpMatDescr_t mat, + const gko::Array &scalars, + const gko::LinOp *b, gko::LinOp *x, + cusparseOperation_t trans, cusparseSpMVAlg_t alg) +{ + cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type(); + using gko::kernels::cuda::as_culibs_type; + auto dense_b = gko::as>(b); + auto dense_x = 
gko::as>(x); + auto db = dense_b->get_const_values(); + auto dx = dense_x->get_values(); + const auto id = gpu_exec->get_device_id(); + gko::cuda::device_guard g{id}; + cusparseDnVecDescr_t vecb, vecx; + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseCreateDnVec(&vecx, dense_x->get_num_stored_elements(), + as_culibs_type(dx), cu_value)); + // cusparseCreateDnVec only allows non-const pointer + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateDnVec( + &vecb, dense_b->get_num_stored_elements(), + as_culibs_type(const_cast(db)), cu_value)); + + size_t buffer_size = 0; + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpMV_bufferSize( + gpu_exec->get_cusparse_handle(), trans, &scalars.get_const_data()[0], + mat, vecb, &scalars.get_const_data()[1], vecx, cu_value, alg, + &buffer_size)); + gko::Array buffer_array(gpu_exec, buffer_size); + auto dbuffer = buffer_array.get_data(); + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpMV( + gpu_exec->get_cusparse_handle(), trans, &scalars.get_const_data()[0], + mat, vecb, &scalars.get_const_data()[1], vecx, cu_value, alg, dbuffer)); + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(vecx)); + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(vecb)); +} + + +template +class CuspGenericCsr + : public gko::EnableLinOp, + CuspBase>, + public gko::EnableCreateMethod>, + public gko::ReadableFromMatrixData { + friend class gko::EnableCreateMethod; + friend class gko::EnablePolymorphicObject; + +public: + using csr = gko::matrix::Csr; + using mat_data = gko::matrix_data; + cusparseIndexType_t cu_index = + gko::kernels::cuda::cusparse_index_type(); + cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type(); + + void read(const mat_data &data) override + { + using gko::kernels::cuda::as_culibs_type; + csr_->read(data); + this->set_size(gko::dim<2>{csr_->get_size()}); + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseCreateCsr(&mat_, csr_->get_size()[0], csr_->get_size()[1], + csr_->get_num_stored_elements(), + as_culibs_type(csr_->get_row_ptrs()), + 
as_culibs_type(csr_->get_col_idxs()), + as_culibs_type(csr_->get_values()), cu_index, + cu_index, CUSPARSE_INDEX_BASE_ZERO, cu_value)); + } + + gko::size_type get_num_stored_elements() const noexcept + { + return csr_->get_num_stored_elements(); + } + + ~CuspGenericCsr() override + { + const auto id = this->get_gpu_exec()->get_device_id(); + try { + gko::cuda::device_guard g{id}; + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpMat(mat_)); + } catch (const std::exception &e) { + std::cerr << "Error when unallocating CuspGenericCsr mat_ matrix: " + << e.what() << std::endl; + } + } + + CuspGenericCsr(const CuspGenericCsr &other) = delete; + + CuspGenericCsr &operator=(const CuspGenericCsr &other) = default; + +protected: + void apply_impl(const gko::LinOp *b, gko::LinOp *x) const override + { + cusp_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, x, trans_, + Alg); + } + + CuspGenericCsr(std::shared_ptr exec, + const gko::dim<2> &size = gko::dim<2>{}) + : gko::EnableLinOp(exec, size), + csr_(std::move( + csr::create(exec, std::make_shared()))), + trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) + {} + +private: + // Contains {alpha, beta} + gko::Array scalars{ + this->get_executor(), {gko::one(), gko::zero()}}; + std::shared_ptr csr_; + cusparseOperation_t trans_; + cusparseSpMatDescr_t mat_; +}; + + +template +class CuspGenericCoo + : public gko::EnableLinOp, CuspBase>, + public gko::EnableCreateMethod>, + public gko::ReadableFromMatrixData { + friend class gko::EnableCreateMethod; + friend class gko::EnablePolymorphicObject; + +public: + using coo = gko::matrix::Coo; + using mat_data = gko::matrix_data; + cusparseIndexType_t cu_index = + gko::kernels::cuda::cusparse_index_type(); + cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type(); + + void read(const mat_data &data) override + { + using gko::kernels::cuda::as_culibs_type; + coo_->read(data); + this->set_size(gko::dim<2>{coo_->get_size()}); + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseCreateCoo(&mat_, 
coo_->get_size()[0], coo_->get_size()[1], + coo_->get_num_stored_elements(), + as_culibs_type(coo_->get_row_idxs()), + as_culibs_type(coo_->get_col_idxs()), + as_culibs_type(coo_->get_values()), cu_index, + CUSPARSE_INDEX_BASE_ZERO, cu_value)); + } + + gko::size_type get_num_stored_elements() const noexcept + { + return coo_->get_num_stored_elements(); + } + + ~CuspGenericCoo() override + { + const auto id = this->get_gpu_exec()->get_device_id(); + try { + gko::cuda::device_guard g{id}; + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpMat(mat_)); + } catch (const std::exception &e) { + std::cerr << "Error when unallocating CuspGenericCoo mat_ matrix: " + << e.what() << std::endl; + } + } + + CuspGenericCoo(const CuspGenericCoo &other) = delete; + + CuspGenericCoo &operator=(const CuspGenericCoo &other) = default; + +protected: + void apply_impl(const gko::LinOp *b, gko::LinOp *x) const override + { + cusp_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, x, trans_, + CUSPARSE_MV_ALG_DEFAULT); + } + + CuspGenericCoo(std::shared_ptr exec, + const gko::dim<2> &size = gko::dim<2>{}) + : gko::EnableLinOp(exec, size), + coo_(std::move(coo::create(exec))), + trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) + {} + +private: + // Contains {alpha, beta} + gko::Array scalars{ + this->get_executor(), {gko::one(), gko::zero()}}; + std::shared_ptr coo_; + cusparseOperation_t trans_; + cusparseSpMatDescr_t mat_; +}; + + +#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && + // !(defined(_WIN32) || defined(__CYGWIN__)) + + } // namespace detail @@ -485,6 +694,20 @@ using cusp_csrmp = detail::CuspCsrmp<>; using cusp_csrmm = detail::CuspCsrmm<>; +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && \ + !(defined(_WIN32) || defined(__CYGWIN__)) + + +using cusp_gcsr = detail::CuspGenericCsr<>; +using cusp_gcsr2 = + detail::CuspGenericCsr; +using cusp_gcoo = detail::CuspGenericCoo<>; + + +#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && + // !(defined(_WIN32) || 
defined(__CYGWIN__)) + + using cusp_coo = detail::CuspHybrid; using cusp_ell = diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp index aa757030017..c4379f834f7 100644 --- a/benchmark/utils/formats.hpp +++ b/benchmark/utils/formats.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -45,8 +45,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef HAS_CUDA -#include "cuda_linops.hpp" +#include "benchmark/utils/cuda_linops.hpp" #endif // HAS_CUDA +#ifdef HAS_HIP +#include "benchmark/utils/hip_linops.hip.hpp" +#endif // HAS_HIP namespace formats { @@ -60,6 +63,9 @@ std::string available_format = ", cusp_csr, cusp_csrex, cusp_csrmp, cusp_csrmm, cusp_coo, cusp_ell, " "cusp_hybrid" #endif // HAS_CUDA +#ifdef HAS_HIP + ", hipsp_csr, hipsp_csrmm, hipsp_coo, hipsp_ell, hipsp_hybrid" +#endif // HAS_HIP ".\n"; std::string format_description = @@ -91,7 +97,28 @@ std::string format_description = "cusp_csrex: benchmark CuSPARSE with the cusparseXcsrmvEx function.\n" "cusp_csrmp: benchmark CuSPARSE with the cusparseXcsrmv_mp function.\n" "cusp_csrmm: benchmark CuSPARSE with the cusparseXcsrmv_mm function." 
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && \ + !(defined(_WIN32) || defined(__CYGWIN__)) + "\n" + "cusp_gcsr: benchmark CuSPARSE with the generic csr with default " + "algorithm.\n" + "cusp_gcsr2: benchmark CuSPARSE with the generic csr with " + "CUSPARSE_CSRMV_ALG2.\n" + "cusp_gcoo: benchmark CuSPARSE with the generic coo with default " + "algorithm.\n" +#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && + // !(defined(_WIN32) || defined(__CYGWIN__)) #endif // HAS_CUDA +#ifdef HAS_HIP + "\n" + "hipsp_csr: benchmark HipSPARSE with the hipsparseXcsrmv function.\n" + "hipsp_csrmm: benchmark HipSPARSE with the hipsparseXcsrmv_mm function.\n" + "hipsp_hybrid: benchmark HipSPARSE spmv with hipsparseXhybmv and an " + "automatic partition.\n" + "hipsp_coo: use hipsparseXhybmv with a HIPSPARSE_HYB_PARTITION_USER " + "partition.\n" + "hipsp_ell: use hipsparseXhybmv with HIPSPARSE_HYB_PARTITION_MAX partition." +#endif // HAS_HIP ; std::string format_command = @@ -148,6 +175,7 @@ std::unique_ptr read_matrix_from_data( } +// clang-format off const std::map( std::shared_ptr, const gko::matrix_data<> &)>> @@ -166,7 +194,21 @@ const std::map( {"cusp_hybrid", read_matrix_from_data}, {"cusp_coo", read_matrix_from_data}, {"cusp_ell", read_matrix_from_data}, +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && \ + !(defined(_WIN32) || defined(__CYGWIN__)) + {"cusp_gcsr", read_matrix_from_data}, + {"cusp_gcsr2", read_matrix_from_data}, + {"cusp_gcoo", read_matrix_from_data}, +#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && + // !(defined(_WIN32) || defined(__CYGWIN__)) #endif // HAS_CUDA +#ifdef HAS_HIP + {"hipsp_csr", read_matrix_from_data}, + {"hipsp_csrmm", read_matrix_from_data}, + {"hipsp_hybrid", read_matrix_from_data}, + {"hipsp_coo", read_matrix_from_data}, + {"hipsp_ell", read_matrix_from_data}, +#endif // HAS_HIP {"hybrid", read_matrix_from_data}, {"hybrid0", READ_MATRIX(hybrid, std::make_shared(0))}, @@ -194,8 +236,9 @@ const 
std::map( READ_MATRIX(hybrid, std::make_shared())}, {"sellp", read_matrix_from_data>}}; +// clang-format on } // namespace formats -#endif // GKO_BENCHMARK_UTILS_FORMATS_HPP_ \ No newline at end of file +#endif // GKO_BENCHMARK_UTILS_FORMATS_HPP_ diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp index 267109f0dcc..2cb738ce1dd 100644 --- a/benchmark/utils/general.hpp +++ b/benchmark/utils/general.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include @@ -57,9 +58,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // Global command-line arguments -DEFINE_string( - executor, "reference", - "The executor used to run the benchmarks, one of: reference, omp, cuda"); +DEFINE_string(executor, "reference", + "The executor used to run the benchmarks, one of: reference, " + "omp, cuda, hip"); DEFINE_uint32(device_id, 0, "ID of the device where to run the code"); @@ -79,6 +80,8 @@ DEFINE_string(double_buffer, "", DEFINE_bool(detailed, true, "If set, performs several runs to obtain more detailed results"); +DEFINE_bool(nested_names, false, "If set, separately logs nested operations"); + DEFINE_uint32(seed, 42, "Seed used for the random number generator"); DEFINE_uint32(warmup, 2, "Warm-up repetitions"); @@ -167,7 +170,10 @@ std::ranlux24 &get_engine() std::ostream &operator<<(std::ostream &os, const rapidjson::Value &value) { rapidjson::OStreamWrapper jos(os); - rapidjson::PrettyWriter writer(jos); + rapidjson::PrettyWriter, + rapidjson::UTF8<>, rapidjson::CrtAllocator, + rapidjson::kWriteNanAndInfFlag> + writer(jos); value.Accept(writer); return os; } @@ -251,9 +257,14 @@ const std::map()>> 
executor_factory{ {"reference", [] { return gko::ReferenceExecutor::create(); }}, {"omp", [] { return gko::OmpExecutor::create(); }}, - {"cuda", [] { + {"cuda", + [] { return gko::CudaExecutor::create(FLAGS_device_id, - gko::OmpExecutor::create()); + gko::OmpExecutor::create(), true); + }}, + {"hip", [] { + return gko::HipExecutor::create(FLAGS_device_id, + gko::OmpExecutor::create(), true); }}}; @@ -322,7 +333,7 @@ double get_norm(const vec *norm) template -double compute_norm(const vec *b) +double compute_norm2(const vec *b) { auto exec = b->get_executor(); auto b_norm = gko::initialize>({0.0}, exec); @@ -340,8 +351,35 @@ double compute_residual_norm(const gko::LinOp *system_matrix, auto neg_one = gko::initialize>({-1.0}, exec); auto res = clone(b); system_matrix->apply(lend(one), lend(x), lend(neg_one), lend(res)); - return compute_norm(lend(res)); + return compute_norm2(lend(res)); +} + + +template +double compute_max_relative_norm2(vec *result, + const vec *answer) +{ + auto exec = answer->get_executor(); + auto answer_norm = + vec::create(exec, gko::dim<2>{1, answer->get_size()[1]}); + answer->compute_norm2(lend(answer_norm)); + auto neg_one = gko::initialize>({-1.0}, exec); + result->add_scaled(lend(neg_one), lend(answer)); + auto absolute_norm = + vec::create(exec, gko::dim<2>{1, answer->get_size()[1]}); + result->compute_norm2(lend(absolute_norm)); + auto host_answer_norm = + clone(answer_norm->get_executor()->get_master(), answer_norm); + auto host_absolute_norm = + clone(absolute_norm->get_executor()->get_master(), absolute_norm); + double max_relative_norm2 = 0; + for (gko::size_type i = 0; i < host_answer_norm->get_size()[1]; i++) { + max_relative_norm2 = + std::max(host_absolute_norm->at(0, i) / host_answer_norm->at(0, i), + max_relative_norm2); + } + return max_relative_norm2; } -#endif // GKO_BENCHMARK_UTILS_GENERAL_HPP_ \ No newline at end of file +#endif // GKO_BENCHMARK_UTILS_GENERAL_HPP_ diff --git a/benchmark/utils/hip_linops.hip.hpp 
b/benchmark/utils/hip_linops.hip.hpp new file mode 100644 index 00000000000..5d62d605d24 --- /dev/null +++ b/benchmark/utils/hip_linops.hip.hpp @@ -0,0 +1,334 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_BENCHMARK_UTILS_HIP_LINOPS_HIP_HPP_ +#define GKO_BENCHMARK_UTILS_HIP_LINOPS_HIP_HPP_ + + +#include + + +#include + + +#include + + +#include "hip/base/device_guard.hip.hpp" +#include "hip/base/hipsparse_bindings.hip.hpp" + + +namespace detail { + + +struct hipsparseMatDescr; + + +class HipspBase : public gko::LinOp { +public: + hipsparseMatDescr_t get_descr() const { return this->descr_.get(); } + + const gko::HipExecutor *get_gpu_exec() const { return gpu_exec_.get(); } + +protected: + void apply_impl(const gko::LinOp *, const gko::LinOp *, const gko::LinOp *, + gko::LinOp *) const override + { + GKO_NOT_IMPLEMENTED; + } + + HipspBase(std::shared_ptr exec, + const gko::dim<2> &size = gko::dim<2>{}) + : gko::LinOp(exec, size) + { + gpu_exec_ = std::dynamic_pointer_cast(exec); + if (gpu_exec_ == nullptr) { + GKO_NOT_IMPLEMENTED; + } + this->initialize_descr(); + } + + ~HipspBase() = default; + + HipspBase(const HipspBase &other) = delete; + + HipspBase &operator=(const HipspBase &other) + { + if (this != &other) { + gko::LinOp::operator=(other); + this->gpu_exec_ = other.gpu_exec_; + this->initialize_descr(); + } + return *this; + } + + void initialize_descr() + { + const auto id = this->gpu_exec_->get_device_id(); + gko::hip::device_guard g{id}; + this->descr_ = handle_manager( + reinterpret_cast( + gko::kernels::hip::hipsparse::create_mat_descr()), + [id](hipsparseMatDescr *descr) { + gko::hip::device_guard g{id}; + gko::kernels::hip::hipsparse::destroy(descr); + }); + } + +private: + std::shared_ptr gpu_exec_; + template + using handle_manager = std::unique_ptr>; + handle_manager descr_; +}; + + +template +class HipspCsr + : public gko::EnableLinOp, HipspBase>, + public gko::EnableCreateMethod>, + public gko::ReadableFromMatrixData { + friend class gko::EnableCreateMethod; + friend class gko::EnablePolymorphicObject; + +public: + using csr = gko::matrix::Csr; + using mat_data = 
gko::matrix_data; + + void read(const mat_data &data) override + { + csr_->read(data); + this->set_size(gko::dim<2>{csr_->get_size()}); + } + + gko::size_type get_num_stored_elements() const noexcept + { + return csr_->get_num_stored_elements(); + } + +protected: + void apply_impl(const gko::LinOp *b, gko::LinOp *x) const override + { + auto dense_b = gko::as>(b); + auto dense_x = gko::as>(x); + auto db = dense_b->get_const_values(); + auto dx = dense_x->get_values(); + + const auto id = this->get_gpu_exec()->get_device_id(); + gko::hip::device_guard g{id}; + gko::kernels::hip::hipsparse::spmv( + this->get_gpu_exec()->get_hipsparse_handle(), trans_, + this->get_size()[0], this->get_size()[1], + csr_->get_num_stored_elements(), &scalars.get_const_data()[0], + this->get_descr(), csr_->get_const_values(), + csr_->get_const_row_ptrs(), csr_->get_const_col_idxs(), db, + &scalars.get_const_data()[1], dx); + } + + HipspCsr(std::shared_ptr exec, + const gko::dim<2> &size = gko::dim<2>{}) + : gko::EnableLinOp(exec, size), + csr_(std::move( + csr::create(exec, std::make_shared()))), + trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE) + {} + +private: + // Contains {alpha, beta} + gko::Array scalars{ + this->get_executor(), {gko::one(), gko::zero()}}; + std::shared_ptr csr_; + hipsparseOperation_t trans_; +}; + + +template +class HipspCsrmm + : public gko::EnableLinOp, HipspBase>, + public gko::EnableCreateMethod>, + public gko::ReadableFromMatrixData { + friend class gko::EnableCreateMethod; + friend class gko::EnablePolymorphicObject; + +public: + using csr = gko::matrix::Csr; + using mat_data = gko::matrix_data; + + void read(const mat_data &data) override + { + csr_->read(data); + this->set_size(gko::dim<2>{csr_->get_size()}); + } + + gko::size_type get_num_stored_elements() const noexcept + { + return csr_->get_num_stored_elements(); + } + +protected: + void apply_impl(const gko::LinOp *b, gko::LinOp *x) const override + { + auto dense_b = gko::as>(b); + auto dense_x = 
gko::as>(x); + auto db = dense_b->get_const_values(); + auto dx = dense_x->get_values(); + + const auto id = this->get_gpu_exec()->get_device_id(); + gko::hip::device_guard g{id}; + gko::kernels::hip::hipsparse::spmm( + this->get_gpu_exec()->get_hipsparse_handle(), trans_, + this->get_size()[0], dense_b->get_size()[1], this->get_size()[1], + csr_->get_num_stored_elements(), &scalars.get_const_data()[0], + this->get_descr(), csr_->get_const_values(), + csr_->get_const_row_ptrs(), csr_->get_const_col_idxs(), db, + dense_b->get_size()[0], &scalars.get_const_data()[1], dx, + dense_x->get_size()[0]); + } + + HipspCsrmm(std::shared_ptr exec, + const gko::dim<2> &size = gko::dim<2>{}) + : gko::EnableLinOp(exec, size), + csr_(std::move( + csr::create(exec, std::make_shared()))), + trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE) + {} + +private: + // Contains {alpha, beta} + gko::Array scalars{ + this->get_executor(), {gko::one(), gko::zero()}}; + std::shared_ptr csr_; + hipsparseOperation_t trans_; +}; + + +template +class HipspHybrid + : public gko::EnableLinOp< + HipspHybrid, HipspBase>, + public gko::EnableCreateMethod< + HipspHybrid>, + public gko::ReadableFromMatrixData { + friend class gko::EnableCreateMethod; + friend class gko::EnablePolymorphicObject; + +public: + using csr = gko::matrix::Csr; + using mat_data = gko::matrix_data; + + void read(const mat_data &data) override + { + auto t_csr = csr::create(this->get_executor(), + std::make_shared()); + t_csr->read(data); + this->set_size(gko::dim<2>{t_csr->get_size()}); + + const auto id = this->get_gpu_exec()->get_device_id(); + gko::hip::device_guard g{id}; + gko::kernels::hip::hipsparse::csr2hyb( + this->get_gpu_exec()->get_hipsparse_handle(), this->get_size()[0], + this->get_size()[1], this->get_descr(), t_csr->get_const_values(), + t_csr->get_const_row_ptrs(), t_csr->get_const_col_idxs(), hyb_, + Threshold, Partition); + } + + ~HipspHybrid() override + { + const auto id = this->get_gpu_exec()->get_device_id(); + try 
{ + gko::hip::device_guard g{id}; + GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseDestroyHybMat(hyb_)); + } catch (const std::exception &e) { + std::cerr << "Error when unallocating HipspHybrid hyb_ matrix: " + << e.what() << std::endl; + } + } + + HipspHybrid(const HipspHybrid &other) = delete; + + HipspHybrid &operator=(const HipspHybrid &other) = default; + +protected: + void apply_impl(const gko::LinOp *b, gko::LinOp *x) const override + { + auto dense_b = gko::as>(b); + auto dense_x = gko::as>(x); + auto db = dense_b->get_const_values(); + auto dx = dense_x->get_values(); + + const auto id = this->get_gpu_exec()->get_device_id(); + gko::hip::device_guard g{id}; + gko::kernels::hip::hipsparse::spmv( + this->get_gpu_exec()->get_hipsparse_handle(), trans_, + &scalars.get_const_data()[0], this->get_descr(), hyb_, db, + &scalars.get_const_data()[1], dx); + } + + HipspHybrid(std::shared_ptr exec, + const gko::dim<2> &size = gko::dim<2>{}) + : gko::EnableLinOp(exec, size), + trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE) + { + const auto id = this->get_gpu_exec()->get_device_id(); + gko::hip::device_guard g{id}; + GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreateHybMat(&hyb_)); + } + +private: + // Contains {alpha, beta} + gko::Array scalars{ + this->get_executor(), {gko::one(), gko::zero()}}; + hipsparseOperation_t trans_; + hipsparseHybMat_t hyb_; +}; + + +} // namespace detail + + +// Some shortcuts +using hipsp_csr = detail::HipspCsr<>; +using hipsp_csrmm = detail::HipspCsrmm<>; + + +using hipsp_coo = + detail::HipspHybrid; +using hipsp_ell = + detail::HipspHybrid; +using hipsp_hybrid = detail::HipspHybrid<>; + +#endif // GKO_BENCHMARK_UTILS_HIP_LINOPS_HIP_HPP_ diff --git a/benchmark/utils/loggers.hpp b/benchmark/utils/loggers.hpp index 69ec16bd769..ea6bbea2797 100644 --- a/benchmark/utils/loggers.hpp +++ b/benchmark/utils/loggers.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 
2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include @@ -104,6 +105,7 @@ struct OperationLogger : gko::log::Logger { rapidjson::MemoryPoolAllocator<> &alloc, gko::uint32 repetitions) { + const std::lock_guard lock(mutex); for (const auto &entry : total) { add_or_set_member( object, entry.first.c_str(), @@ -115,37 +117,48 @@ struct OperationLogger : gko::log::Logger { } } - OperationLogger(std::shared_ptr exec) - : gko::log::Logger(exec) + OperationLogger(std::shared_ptr exec, bool nested_name) + : gko::log::Logger(exec), use_nested_name{nested_name} {} private: void start_operation(const gko::Executor *exec, const std::string &name) const { - nested.emplace_back(0); exec->synchronize(); - start[name] = std::chrono::steady_clock::now(); + const std::lock_guard lock(mutex); + auto nested_name = nested.empty() || !use_nested_name + ? 
name + : nested.back().first + "::" + name; + nested.emplace_back(nested_name, std::chrono::steady_clock::duration{}); + start[nested_name] = std::chrono::steady_clock::now(); } void end_operation(const gko::Executor *exec, const std::string &name) const { exec->synchronize(); + const std::lock_guard lock(mutex); + // if operations are properly nested, nested_name now ends with name + auto nested_name = nested.back().first; const auto end = std::chrono::steady_clock::now(); - const auto diff = end - start[name]; + const auto diff = end - start[nested_name]; // make sure timings for nested operations are not counted twice - total[name] += diff - nested.back(); + total[nested_name] += diff - nested.back().second; nested.pop_back(); - if (nested.size() > 0) { - nested.back() += diff; + if (!nested.empty()) { + nested.back().second += diff; } } + bool use_nested_name; + mutable std::mutex mutex; mutable std::map start; mutable std::map total; // the position i of this vector holds the total time spend on child // operations on nesting level i - mutable std::vector nested; + mutable std::vector< + std::pair> + nested; }; @@ -154,18 +167,21 @@ struct StorageLogger : gko::log::Logger { const gko::size_type &num_bytes, const gko::uintptr &location) const override { + const std::lock_guard lock(mutex); storage[location] = num_bytes; } void on_free_completed(const gko::Executor *, const gko::uintptr &location) const override { + const std::lock_guard lock(mutex); storage[location] = 0; } void write_data(rapidjson::Value &output, rapidjson::MemoryPoolAllocator<> &allocator) { + const std::lock_guard lock(mutex); gko::size_type total{}; for (const auto &e : storage) { total += e.second; @@ -178,6 +194,7 @@ struct StorageLogger : gko::log::Logger { {} private: + mutable std::mutex mutex; mutable std::unordered_map storage; }; @@ -190,12 +207,17 @@ struct ResidualLogger : gko::log::Logger { const gko::LinOp *solution, const gko::LinOp *residual_norm) const override { + 
timestamps.PushBack( + std::chrono::duration_cast( + std::chrono::steady_clock::now() - start) + .count(), + alloc); if (residual_norm) { rec_res_norms.PushBack( get_norm(gko::as>(residual_norm)), alloc); } else { rec_res_norms.PushBack( - compute_norm(gko::as>(residual)), alloc); + compute_norm2(gko::as>(residual)), alloc); } if (solution) { true_res_norms.PushBack( @@ -211,22 +233,52 @@ struct ResidualLogger : gko::log::Logger { const gko::LinOp *matrix, const vec *b, rapidjson::Value &rec_res_norms, rapidjson::Value &true_res_norms, + rapidjson::Value ×tamps, rapidjson::MemoryPoolAllocator<> &alloc) : gko::log::Logger(exec, gko::log::Logger::iteration_complete_mask), matrix{matrix}, b{b}, + start{std::chrono::steady_clock::now()}, rec_res_norms{rec_res_norms}, true_res_norms{true_res_norms}, + timestamps{timestamps}, alloc{alloc} {} private: const gko::LinOp *matrix; const vec *b; + std::chrono::steady_clock::time_point start; rapidjson::Value &rec_res_norms; rapidjson::Value &true_res_norms; + rapidjson::Value ×tamps; rapidjson::MemoryPoolAllocator<> &alloc; }; +// Logs the number of iteration executed +struct IterationLogger : gko::log::Logger { + void on_iteration_complete(const gko::LinOp *, + const gko::size_type &num_iterations, + const gko::LinOp *, const gko::LinOp *, + const gko::LinOp *) const override + { + this->num_iters = num_iterations; + } + + IterationLogger(std::shared_ptr exec) + : gko::log::Logger(exec, gko::log::Logger::iteration_complete_mask) + {} + + void write_data(rapidjson::Value &output, + rapidjson::MemoryPoolAllocator<> &allocator) + { + add_or_set_member(output, "iterations", this->num_iters, allocator); + } + +private: + mutable gko::size_type num_iters{0}; +}; + + #endif // GKO_BENCHMARK_UTILS_LOGGERS_HPP_ diff --git a/benchmark/utils/overhead_linop.hpp b/benchmark/utils/overhead_linop.hpp new file mode 100644 index 00000000000..8db715e553f --- /dev/null +++ b/benchmark/utils/overhead_linop.hpp @@ -0,0 +1,226 @@ 
+/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_BENCHMARK_UTILS_OVERHEAD_LINOP_HPP_ +#define GKO_BENCHMARK_UTILS_OVERHEAD_LINOP_HPP_ + + +#include +#include +#include + + +#include +#include +#include + + +namespace gko { +namespace kernels { +namespace overhead { + + +#define GKO_DECLARE_OVERHEAD_OPERATION_KERNEL(_type, _num) \ + static volatile std::uintptr_t val_operation_##_num = 0; \ + template \ + void operation##_num(std::shared_ptr exec, \ + const matrix::Dense<_type> *b, \ + matrix::Dense<_type> *x) \ + { \ + val_operation_##_num = reinterpret_cast(x); \ + } + + +#define GKO_DECLARE_ALL \ + GKO_DECLARE_OVERHEAD_OPERATION_KERNEL(ValueType, 1) \ + GKO_DECLARE_OVERHEAD_OPERATION_KERNEL(ValueType, 2) \ + GKO_DECLARE_OVERHEAD_OPERATION_KERNEL(ValueType, 3) \ + GKO_DECLARE_OVERHEAD_OPERATION_KERNEL(ValueType, 4) \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + + +} // namespace overhead + + +namespace omp { +namespace overhead { + +GKO_DECLARE_ALL; + +} // namespace overhead +} // namespace omp + + +namespace cuda { +namespace overhead { + +GKO_DECLARE_ALL; + +} // namespace overhead +} // namespace cuda + + +namespace reference { +namespace overhead { + +GKO_DECLARE_ALL; + +} // namespace overhead +} // namespace reference + + +namespace hip { +namespace overhead { + +GKO_DECLARE_ALL; + +} // namespace overhead +} // namespace hip + + +#undef GKO_DECLARE_ALL + + +} // namespace kernels + + +namespace overhead { + + +GKO_REGISTER_OPERATION(operation1, overhead::operation1); +GKO_REGISTER_OPERATION(operation2, overhead::operation2); +GKO_REGISTER_OPERATION(operation3, overhead::operation3); +GKO_REGISTER_OPERATION(operation4, overhead::operation4); + + +} // namespace overhead + + +template +class Overhead : public EnableLinOp>, + public Preconditionable { + friend class EnableLinOp; + friend class EnablePolymorphicObject; + +public: + 
GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) + { + /** + * Criterion factories. + */ + std::vector> + GKO_FACTORY_PARAMETER(criteria, nullptr); + + /** + * Preconditioner factory. + */ + std::shared_ptr GKO_FACTORY_PARAMETER( + preconditioner, nullptr); + + /** + * Already generated preconditioner. If one is provided, the factory + * `preconditioner` will be ignored. + */ + std::shared_ptr GKO_FACTORY_PARAMETER( + generated_preconditioner, nullptr); + }; + + GKO_ENABLE_LIN_OP_FACTORY(Overhead, parameters, Factory); + GKO_ENABLE_BUILD_METHOD(Factory); + +protected: + void apply_impl(const LinOp *b, LinOp *x) const override + { + using Vector = matrix::Dense; + + auto exec = this->get_executor(); + auto dense_b = as(b); + auto dense_x = as(x); + + system_matrix_->apply(dense_b, dense_x); + get_preconditioner()->apply(dense_b, dense_x); + + exec->run(overhead::make_operation1(dense_b, dense_x)); + exec->run(overhead::make_operation2(dense_b, dense_x)); + exec->run(overhead::make_operation3(dense_b, dense_x)); + exec->run(overhead::make_operation4(dense_b, dense_x)); + } + + void apply_impl(const LinOp *alpha, const LinOp *b, const LinOp *beta, + LinOp *x) const override + { + auto dense_x = as>(x); + + auto x_clone = dense_x->clone(); + this->apply(b, x_clone.get()); + dense_x->scale(beta); + dense_x->add_scaled(alpha, x_clone.get()); + } + + explicit Overhead(std::shared_ptr exec) + : EnableLinOp(std::move(exec)) + {} + + explicit Overhead(const Factory *factory, + std::shared_ptr system_matrix) + : EnableLinOp(factory->get_executor(), + transpose(system_matrix->get_size())), + parameters_{factory->get_parameters()}, + system_matrix_{std::move(system_matrix)} + { + if (parameters_.generated_preconditioner) { + GKO_ASSERT_EQUAL_DIMENSIONS(parameters_.generated_preconditioner, + this); + set_preconditioner(parameters_.generated_preconditioner); + } else if (parameters_.preconditioner) { + set_preconditioner( + 
parameters_.preconditioner->generate(system_matrix_)); + } else { + set_preconditioner(matrix::Identity::create( + this->get_executor(), this->get_size()[0])); + } + stop_criterion_factory_ = + stop::combine(std::move(parameters_.criteria)); + } + +private: + std::shared_ptr system_matrix_{}; + std::shared_ptr stop_criterion_factory_{}; +}; + + +} // namespace gko + + +#endif // GKO_BENCHMARK_UTILS_OVERHEAD_LINOP_HPP_ diff --git a/benchmark/utils/spmv_common.hpp b/benchmark/utils/spmv_common.hpp index f027d52c0ce..34cd51067ae 100644 --- a/benchmark/utils/spmv_common.hpp +++ b/benchmark/utils/spmv_common.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef GKO_BENCHMARK_UTILS_SPMV_COMMON_HPP_ #define GKO_BENCHMARK_UTILS_SPMV_COMMON_HPP_ + #include @@ -50,7 +51,8 @@ using csr = gko::matrix::Csr<>; /** * Function which outputs the input format for benchmarks similar to the spmv. 
*/ -[[noreturn]] void print_config_error_and_exit() { +[[noreturn]] void print_config_error_and_exit() +{ std::cerr << "Input has to be a JSON array of matrix configurations:\n" << " [\n" << " { \"filename\": \"my_file.mtx\" },\n" @@ -74,4 +76,4 @@ void validate_option_object(const rapidjson::Value &value) } -#endif // GKO_BENCHMARK_UTILS_SPMV_COMMON_HPP_ \ No newline at end of file +#endif // GKO_BENCHMARK_UTILS_SPMV_COMMON_HPP_ diff --git a/cmake/CTestCustom.cmake.in b/cmake/CTestCustom.cmake.in index afcaf60fb28..1b26cadf225 100644 --- a/cmake/CTestCustom.cmake.in +++ b/cmake/CTestCustom.cmake.in @@ -5,15 +5,15 @@ list(APPEND CTEST_CUSTOM_COVERAGE_EXCLUDE # Exclude try_compile sources from coverage results: "/CMakeFiles/CMakeTmp/" - "third_party" + ".*/third_party/.*" - "test" + ".*/doc/.*" - "benchmark" + ".*/benchmark/.*" - "examples" + ".*/examples/.*" - "c\\+\\+" + ".*/c\\+\\+/.*" ) set(CTEST_SOURCE_DIRECTORY "@Ginkgo_SOURCE_DIR@" CACHE STRING "" FORCE) diff --git a/cmake/CTestScript.cmake b/cmake/CTestScript.cmake index ff4828d01ce..27f03bce996 100644 --- a/cmake/CTestScript.cmake +++ b/cmake/CTestScript.cmake @@ -8,13 +8,17 @@ # CDash dashboard. The supported runs are: # + With or without coverage, requires the gcov tool. # + With or without address sanitizers. +# + With or without memory sanitizers. # + With or without thread sanitizers. +# + With or without leak sanitizers. +# + With or without undefined behavior (UB) sanitizers. # + With or without valgrind, requires the valgrind tool. # # Note that only one of these can be ran at once, as the build types -# conflict. Ginkgo is always configured with CUDA, OpenMP and Reference -# support. The results are always sent to the dashboard: -# https://my.cdash.org/index.php?project=Ginkgo+Project +# conflict. Ginkgo is always configured with CUDA, HIP, OpenMP and Reference +# support, except for ThreadSanitizer, AddressSanitizer, LeakSanitizer, +# UndefinedBehaviorSanitizer builds. 
The results are always sent to the +# dashboard: https://my.cdash.org/index.php?project=Ginkgo+Project # # Running the script # ^^^^^^^^^^^^^^^^^^ @@ -46,11 +50,13 @@ # A string to describe the machine this is ran on. Default FineCI. # # ``CTEST_CMAKE_GENERATOR`` -# Which generator should be used for the build. Default `Unix Makefiles` +# Which generator should be used for the build. Default `Ninja`, except +# for COVERAGE builds where `Unix Makefiles` is used. # # ``CTEST_BUILD_CONFIGURATION`` # Which configuration should Ginkgo be built with. Default `DEBUG`. -# The supported values are: COVERAGE, ASAN, TSAN, DEBUG and RELEASE. +# The supported values are: COVERAGE, TSAN, UBSAN, DEBUG, and +# RELEASE. # # ``CTEST_TEST_MODEL`` # Which CTest test model should be used. Default `Continuous`. @@ -61,8 +67,9 @@ # The name of the build being ran. Default: `CTEST_BUILD_CONFIGURATION` # # ``CTEST_MEMORYCHECK_TYPE`` -# Whether memorycheck should be ran. Default: `None`. Supported values are: -# Valgrind, ThreadSanitizer, AddressSanitizer and None. +# Whether memorycheck should be ran. Default: `NONE`. Supported values are: +# Valgrind, AddressSanitizer, LeakSanitizer, ThreadSanitizer, +# UndefinedBehaviorSanitizer and NONE. # if (NOT DEFINED CTEST_SOURCE_DIRECTORY) @@ -78,10 +85,14 @@ if (NOT DEFINED CTEST_SITE) endif() if (NOT DEFINED CTEST_CMAKE_GENERATOR) - set(CTEST_CMAKE_GENERATOR "Unix Makefiles") + if (CTEST_BUILD_CONFIGURATION STREQUAL "COVERAGE") + set(CTEST_CMAKE_GENERATOR "Unix Makefiles") + else() + set(CTEST_CMAKE_GENERATOR "Ninja") + endif() endif() -# Supported: COVERAGE, ASAN, TSAN, DEBUG and RELEASE +# Supported: COVERAGE, ASAN, LSAN, TSAN, UBSAN, DEBUG and RELEASE if (NOT DEFINED CTEST_BUILD_CONFIGURATION) set(CTEST_BUILD_CONFIGURATION "DEBUG") endif() @@ -94,9 +105,10 @@ if (NOT DEFINED CTEST_BUILD_NAME) set(CTEST_BUILD_NAME "${CTEST_BUILD_CONFIGURATION}") endif() -#Supported: Valgrind, ThreadSanitizer, AddressSanitizer. 
+#Supported: Valgrind, ThreadSanitizer, AddressSanitizer, LeakSanitizer +#and UndefinedBehaviorSanitizer. if (NOT DEFINED CTEST_MEMORYCHECK_TYPE) - set(CTEST_MEMORYCHECK_TYPE "None") + set(CTEST_MEMORYCHECK_TYPE "NONE") endif() # Find coverage and valgrind tools @@ -112,28 +124,39 @@ if(CTEST_BUILD_CONFIGURATION STREQUAL "COVERAGE") endif() if(NOT CTEST_MEMORYCHECK_TYPE STREQUAL "Valgrind") - set(CTEST_MEMORYCHECK_SANITIZER_OPTIONS "verbosity=1") + set(CTEST_MEMORYCHECK_SANITIZER_OPTIONS "${CTEST_MEMORYCHECK_SANITIZER_OPTIONS}:allocator_may_return_null=1:verbosity=1") endif() include(ProcessorCount) ProcessorCount(PROC_COUNT) if(NOT PROC_COUNT EQUAL 0) - if (PROC_COUNT GREATER 10) - set(PROCT_COUNT 10) + if (DEFINED ENV{CI_PARALLELISM}) + set(PROC_COUNT "$ENV{CI_PARALLELISM}") + elseif(PROC_COUNT LESS 4) + set(PROC_COUNT 1) + else() + set(PROC_COUNT 4) endif() if(NOT WIN32) set(CTEST_BUILD_FLAGS "-j${PROC_COUNT}") endif(NOT WIN32) endif() + ctest_start("${CTEST_TEST_MODEL}") ctest_submit(PARTS Start) -if(CTEST_MEMORYCHECK_TYPE STREQUAL "AddressSanitizer" OR CTEST_MEMORYCHECK_TYPE STREQUAL "ThreadSanitizer") - set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=ON;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION}") +if((NOT CTEST_MEMORYCHECK_TYPE STREQUAL "NONE" AND NOT CTEST_MEMORYCHECK_TYPE STREQUAL "Valgrind") OR CTEST_BUILD_CONFIGURATION STREQUAL "COVERAGE") + set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_DEVEL_TOOLS=OFF;-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=ON;-DGINKGO_BUILD_CUDA=OFF;-DGINKGO_BUILD_HIP=OFF;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION}") else() - set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=ON;-DGINKGO_BUILD_CUDA=ON;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION}") + set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_DEVEL_TOOLS=OFF;-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=ON;-DGINKGO_BUILD_CUDA=ON;-DGINKGO_BUILD_HIP=ON;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION}") endif() + +# UBSAN needs 
gold linker +if (CTEST_MEMORYCHECK_TYPE STREQUAL "UndefinedBehaviorSanitizer") + set(GINKGO_CONFIGURE_OPTIONS "${GINKGO_CONFIGURE_OPTIONS};-DCMAKE_SHARED_LINKER_FLAGS=-fuse-ld=gold;-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=gold") +endif() + ctest_configure(BUILD "${CTEST_BINARY_DIRECTORY}" OPTIONS "${GINKGO_CONFIGURE_OPTIONS}" APPEND) ctest_submit(PARTS Configure) @@ -146,7 +169,7 @@ ctest_build(BUILD "${CTEST_BINARY_DIRECTORY}" APPEND) ctest_submit(PARTS Build) -if (CTEST_MEMORYCHECK_TYPE STREQUAL "None") +if (CTEST_MEMORYCHECK_TYPE STREQUAL "NONE") ctest_test(BUILD "${CTEST_BINARY_DIRECTORY}" APPEND) ctest_submit(PARTS Test) endif() @@ -156,7 +179,7 @@ if (CTEST_BUILD_CONFIGURATION STREQUAL "COVERAGE") ctest_submit(PARTS Coverage) endif() -if(NOT CTEST_MEMORYCHECK_TYPE STREQUAL "None") +if(NOT CTEST_MEMORYCHECK_TYPE STREQUAL "NONE") ctest_memcheck(BUILD "${CTEST_BINARY_DIRECTORY}" APPEND) ctest_submit(PARTS MemCheck) endif() diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in index e944ff6e933..0348f956e7b 100644 --- a/cmake/GinkgoConfig.cmake.in +++ b/cmake/GinkgoConfig.cmake.in @@ -35,6 +35,7 @@ set(GINKGO_EXE_LINKER_FLAGS_RELEASE "@CMAKE_EXE_LINKER_FLAGS_RELEASE@") set(GINKGO_BUILD_REFERENCE @GINKGO_BUILD_REFERENCE@) set(GINKGO_BUILD_OMP @GINKGO_BUILD_OMP@) set(GINKGO_BUILD_CUDA @GINKGO_BUILD_CUDA@) +set(GINKGO_BUILD_HIP @GINKGO_BUILD_HIP@) set(GINKGO_DEVEL_TOOLS @GINKGO_DEVEL_TOOLS@) set(GINKGO_BUILD_TESTS @GINKGO_BUILD_TESTS@) @@ -59,8 +60,16 @@ set(GINKGO_IWYU_PATH @GINKGO_IWYU_PATH@) set(GINKGO_JACOBI_FULL_OPTIMIZATIONS @GINKGO_JACOBI_FULL_OPTIMIZATIONS@) set(GINKGO_CUDA_ARCHITECTURES @GINKGO_CUDA_ARCHITECTURES@) +set(GINKGO_CUDA_DEFAULT_HOST_COMPILER @GINKGO_CUDA_DEFAULT_HOST_COMPILER@) set(GINKGO_CUDA_HOST_COMPILER @CMAKE_CUDA_HOST_COMPILER@) +set(GINKGO_HIP_COMPILER_FLAGS @GINKGO_HIP_COMPILER_FLAGS@) +set(GINKGO_HIP_HCC_COMPILER_FLAGS @GINKGO_HIP_HCC_COMPILER_FLAGS@) +set(GINKGO_HIP_NVCC_COMPILER_FLAGS @GINKGO_HIP_NVCC_COMPILER_FLAGS@) 
+set(GINKGO_HIP_PLATFORM @GINKGO_HIP_PLATFORM@) +set(GINKGO_HIP_AMDGPU @GINKGO_HIP_AMDGPU@) +set(GINKGO_HIP_VERSION @GINKGO_HIP_VERSION@) + set(GINKGO_HAVE_PAPI_SDE @GINKGO_HAVE_PAPI_SDE@) # Ginkgo external package variables @@ -87,7 +96,10 @@ set(GINKGO_INSTALL_LIBRARY_DIR "${GINKGO_INSTALL_PREFIX}/@GINKGO_INSTALL_LIBRARY set(GINKGO_INSTALL_PKGCONFIG_DIR "${GINKGO_INSTALL_PREFIX}/@GINKGO_INSTALL_PKGCONFIG_DIR@") set(GINKGO_INSTALL_CONFIG_DIR "${GINKGO_INSTALL_PREFIX}/@GINKGO_INSTALL_CONFIG_DIR@") set(GINKGO_INSTALL_MODULE_DIR "${GINKGO_INSTALL_PREFIX}/@GINKGO_INSTALL_MODULE_DIR@") -set(CMAKE_MODULE_PATH "${GINKGO_INSTALL_MODULE_DIR}") + +# Forward Ginkgo's MODULE PATH and the PREFIX PATH for HIP and more +list(APPEND CMAKE_MODULE_PATH "@CMAKE_MODULE_PATH@" "${GINKGO_INSTALL_MODULE_DIR}") +list(APPEND CMAKE_PREFIX_PATH "@CMAKE_PREFIX_PATH@") set(GINKGO_INTERFACE_LINK_LIBRARIES "@GINKGO_INTERFACE_LINK_LIBRARIES@") @@ -104,9 +116,9 @@ set(GINKGO_CUSPARSE_LIBRARIES @CUSPARSE@) set(GINKGO_CUDA_LIBRARIES @CUDA_RUNTIME_LIBS@) set(GINKGO_CUDA_TOOLKIT_INCLUDE_DIRECTORIES "@CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES@") -set(GINKGO_CUDA_FLAGS "@CMAKE_CUDA_FLAGS@") -set(GINKGO_CUDA_FLAGS_DEBUG "@CMAKE_CUDA_FLAGS_DEBUG@") -set(GINKGO_CUDA_FLAGS_RELEASE "@CMAKE_CUDA_FLAGS_RELEASE@") +set(GINKGO_CUDA_FLAGS "@CMAKE_CUDA_FLAGS_MODIFY@") +set(GINKGO_CUDA_FLAGS_DEBUG "@CMAKE_CUDA_FLAGS_DEBUG_MODIFY@") +set(GINKGO_CUDA_FLAGS_RELEASE "@CMAKE_CUDA_FLAGS_RELEASE_MODIFY@") # OpenMP set(GINKGO_OPENMP_VERSION @OpenMP_CXX_VERSION@) @@ -116,10 +128,47 @@ set(GINKGO_OPENMP_LIBRARIES @OpenMP_CXX_LIBRARIES@) set(GINKGO_OPENMP_FLAGS "@OpenMP_CXX_FLAGS@") -# Modulepath configuration +# Provide useful HIP helper functions +include(${CMAKE_CURRENT_LIST_DIR}/hip_helpers.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/windows_helpers.cmake) # NOTE: we do not export benchmarks, examples, tests or devel tools # so `third_party` libraries are currently unneeded. 
+# propagate CUDA_HOST_COMPILER if Ginkgo was built with CUDA +if (GINKGO_BUILD_CUDA AND GINKGO_CUDA_HOST_COMPILER AND NOT CMAKE_CUDA_HOST_COMPILER) + message(STATUS "Ginkgo: Setting CUDA host compiler to ${GINKGO_CXX_COMPILER}") + set(CMAKE_CUDA_HOST_COMPILER "${GINKGO_CXX_COMPILER}" CACHE STRING "" FORCE) +endif() + +if(GINKGO_HAVE_PAPI_SDE) + find_package(PAPI REQUIRED OPTIONAL_COMPONENTS sde) +endif() + +# HIP depends on Threads::Threads in some circumstances, but doesn't find it +if (GINKGO_BUILD_HIP) + find_package(Threads REQUIRED) +endif() + +# Needed because of a known issue with CUDA while linking statically. +# For details, see https://gitlab.kitware.com/cmake/cmake/issues/18614 +if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_CUDA) + enable_language(CUDA) +endif() + +if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_HIP) + find_package(HIP REQUIRED) + find_package(hipblas REQUIRED) + find_package(hipsparse REQUIRED) + if(GINKGO_HIP_PLATFORM MATCHES "hcc") + ginkgo_hip_ban_link_hcflag(hcc::hccrt) + ginkgo_hip_ban_link_hcflag(hcc::hc_am) + ginkgo_hip_ban_link_hcflag(hcc::mcwamp) + ginkgo_hip_ban_compile_hcflag(hcc::hccrt) + ginkgo_hip_ban_compile_hcflag(hcc::hc_am) + ginkgo_hip_ban_compile_hcflag(hcc::mcwamp) + endif() +endif() + include(${CMAKE_CURRENT_LIST_DIR}/GinkgoTargets.cmake) diff --git a/cmake/autodetect_executors.cmake b/cmake/autodetect_executors.cmake new file mode 100644 index 00000000000..1f90640acb9 --- /dev/null +++ b/cmake/autodetect_executors.cmake @@ -0,0 +1,27 @@ +set(GINKGO_HAS_OMP OFF) +set(GINKGO_HAS_CUDA OFF) +set(GINKGO_HAS_HIP OFF) +find_package(OpenMP) +include(CheckLanguage) +check_language(CUDA) + +if(OpenMP_CXX_FOUND) + if(NOT DEFINED GINKGO_BUILD_OMP) + message(STATUS "Enabling OpenMP executor") + endif() + set(GINKGO_HAS_OMP ON) +endif() + +if(CMAKE_CUDA_COMPILER) + if(NOT DEFINED GINKGO_BUILD_CUDA) + message(STATUS "Enabling CUDA executor") + endif() + set(GINKGO_HAS_CUDA ON) +endif() + +if(GINKGO_HIPCONFIG_PATH) + 
if(NOT DEFINED GINKGO_BUILD_HIP) + message(STATUS "Enabling HIP executor") + endif() + set(GINKGO_HAS_HIP ON) +endif() diff --git a/cmake/build_helpers.cmake b/cmake/build_helpers.cmake index 8a8ad047d56..860926add11 100644 --- a/cmake/build_helpers.cmake +++ b/cmake/build_helpers.cmake @@ -17,6 +17,9 @@ function(ginkgo_compile_features name) if(GINKGO_WITH_IWYU AND GINKGO_IWYU_PATH) set_property(TARGET "${name}" PROPERTY CXX_INCLUDE_WHAT_YOU_USE ${GINKGO_IWYU_PATH}) endif() + # Set an appropriate SONAME + set_property(TARGET "${name}" PROPERTY + SOVERSION "${Ginkgo_VERSION}") if(GINKGO_CHANGED_SHARED_LIBRARY) # Put all shared libraries and corresponding imported libraries into the specified path set_property(TARGET "${name}" PROPERTY @@ -37,6 +40,61 @@ function(ginkgo_compile_features name) ginkgo_check_shared_library("${CMAKE_SHARED_LIBRARY_PREFIX}${name}${CMAKE_SHARED_LIBRARY_SUFFIX}") endif() endif() + + if (GINKGO_CHECK_CIRCULAR_DEPS) + target_link_libraries("${name}" PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}") + endif() + + set_target_properties("${name}" PROPERTIES POSITION_INDEPENDENT_CODE ON) +endfunction() + +function(ginkgo_check_headers target) + # build object library used to "compile" the headers + # add a proxy source file for each header in the target source list + file(GLOB_RECURSE CUDA_HEADERS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" CONFIGURE_DEPENDS "*.cuh") + file(GLOB_RECURSE HIP_HEADERS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" CONFIGURE_DEPENDS "*.hip.hpp") + file(GLOB_RECURSE CXX_HEADERS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" CONFIGURE_DEPENDS "*.hpp") + list(FILTER CXX_HEADERS EXCLUDE REGEX ".*\.hip\.hpp$") + list(FILTER CXX_HEADERS EXCLUDE REGEX "^test.*") + list(FILTER CUDA_HEADERS EXCLUDE REGEX "^test.*") + list(FILTER HIP_HEADERS EXCLUDE REGEX "^test.*") + + set(SOURCES "") + foreach(HEADER ${CUDA_HEADERS}) + set(HEADER_SOURCEFILE "${CMAKE_CURRENT_BINARY_DIR}/${HEADER}.cu") + file(WRITE "${HEADER_SOURCEFILE}" "#include \"${HEADER}\"") + 
list(APPEND SOURCES "${HEADER_SOURCEFILE}") + endforeach() + + foreach(HEADER ${CXX_HEADERS}) + set(HEADER_SOURCEFILE "${CMAKE_CURRENT_BINARY_DIR}/${HEADER}.cpp") + file(WRITE "${HEADER_SOURCEFILE}" "#include \"${HEADER}\"") + list(APPEND SOURCES "${HEADER_SOURCEFILE}") + endforeach() + if (SOURCES) + add_library(${target}_headers OBJECT ${SOURCES}) + target_link_libraries(${target}_headers PRIVATE ${target}) + target_include_directories(${target}_headers PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") + endif() + + set(HIP_SOURCES "") + foreach(HEADER ${HIP_HEADERS}) + set(HEADER_SOURCEFILE "${CMAKE_CURRENT_BINARY_DIR}/${HEADER}.hip.cpp") + file(WRITE "${HEADER_SOURCEFILE}" "#include \"${HEADER}\"") + list(APPEND HIP_SOURCES "${HEADER_SOURCEFILE}") + endforeach() + if (HIP_SOURCES) + set_source_files_properties(${HIP_SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT TRUE) + hip_add_library(${target}_headers_hip ${HIP_SOURCES}) # the compiler options get set by linking to ginkgo_hip + target_link_libraries(${target}_headers_hip PRIVATE ${target} roc::hipblas roc::hipsparse) + target_include_directories(${target}_headers_hip + PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}" + "${GINKGO_HIP_THRUST_PATH}" + "${HIPBLAS_INCLUDE_DIRS}" + "${HIPSPARSE_INCLUDE_DIRS}" + "${ROCPRIM_INCLUDE_DIRS}") + endif() endfunction() function(ginkgo_check_shared_library name) @@ -73,25 +131,8 @@ function(ginkgo_check_shared_library name) endif() endfunction() -function(ginkgo_switch_windows_link lang from to) - foreach(flag_var - "CMAKE_${lang}_FLAGS" "CMAKE_${lang}_FLAGS_DEBUG" "CMAKE_${lang}_FLAGS_RELEASE" - "CMAKE_${lang}_FLAGS_MINSIZEREL" "CMAKE_${lang}_FLAGS_RELWITHDEBINFO" - ) - if(${flag_var} MATCHES "/${from}") - string(REGEX REPLACE "/${from}" "/${to}" ${flag_var} "${${flag_var}}") - endif(${flag_var} MATCHES "/${from}") - if(${flag_var} MATCHES "-${from}") - string(REGEX REPLACE "-${from}" "-${to}" ${flag_var} "${${flag_var}}") - endif(${flag_var} MATCHES "-${from}") - set(${flag_var} 
"${${flag_var}}" CACHE STRING "" FORCE) - endforeach() -endfunction() - -macro(ginkgo_switch_to_windows_static lang) - ginkgo_switch_windows_link(${lang} "MD" "MT") -endmacro() - -macro(ginkgo_switch_to_windows_dynamic lang) - ginkgo_switch_windows_link(${lang} "MT" "MD") +macro(ginkgo_modify_flags name) + # add escape before " + # the result var is ${name}_MODIFY + string(REPLACE "\"" "\\\"" ${name}_MODIFY "${${name}}") endmacro() diff --git a/cmake/build_type_helpers.cmake b/cmake/build_type_helpers.cmake index eba35b828d6..f3366b031e0 100644 --- a/cmake/build_type_helpers.cmake +++ b/cmake/build_type_helpers.cmake @@ -27,18 +27,52 @@ include(CMakeDependentOption) -set(${PROJECT_NAME}_CUSTOM_BUILD_TYPES "COVERAGE;TSAN;ASAN" CACHE INTERNAL "") +set(${PROJECT_NAME}_CUSTOM_BUILD_TYPES "COVERAGE;TSAN;ASAN;LSAN;UBSAN" CACHE INTERNAL "") + +# LLVM provides all sanitizers in a single library, but they are separate in GCC +if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + set(GKO_TSAN_LIBRARIES "-static-libsan") + set(GKO_UBSAN_LIBRARIES "-static-libsan") +else() + set(GKO_TSAN_LIBRARIES "-static-libtsan") + set(GKO_UBSAN_LIBRARIES "-static-libubsan") +endif() set(${PROJECT_NAME}_COVERAGE_COMPILER_FLAGS "-g -O0 --coverage" CACHE INTERNAL "") set(${PROJECT_NAME}_COVERAGE_LINKER_FLAGS "--coverage" CACHE INTERNAL "") -set(${PROJECT_NAME}_TSAN_COMPILER_FLAGS "-g -O1 -fsanitize=thread -fno-omit-frame-pointer -fPIC" CACHE INTERNAL "") -set(${PROJECT_NAME}_TSAN_LINKER_FLAGS "-fsanitize=thread -static-libtsan -fno-omit-frame-pointer -fPIC" CACHE INTERNAL "") -set(${PROJECT_NAME}_ASAN_COMPILER_FLAGS "-g -O1 -fsanitize=address -fno-omit-frame-pointer" CACHE INTERNAL "") -set(${PROJECT_NAME}_ASAN_LINKER_FLAGS "-fsanitize=address -fno-omit-frame-pointer" CACHE INTERNAL "") +set(${PROJECT_NAME}_TSAN_COMPILER_FLAGS "-g -O1 -fsanitize=thread -fno-omit-frame-pointer -fPIC" CACHE INTERNAL "") +set(${PROJECT_NAME}_TSAN_LINKER_FLAGS "-fsanitize=thread ${GKO_TSAN_LIBRARIES} 
-fno-omit-frame-pointer -fPIC" CACHE INTERNAL "") +set(${PROJECT_NAME}_ASAN_COMPILER_FLAGS "-g -O1 -fsanitize=address -fno-omit-frame-pointer" CACHE INTERNAL "") +set(${PROJECT_NAME}_ASAN_LINKER_FLAGS "-fsanitize=address -fno-omit-frame-pointer" CACHE INTERNAL "") +set(${PROJECT_NAME}_LSAN_COMPILER_FLAGS "-g -O1 -fsanitize=leak" CACHE INTERNAL "") +set(${PROJECT_NAME}_LSAN_LINKER_FLAGS "-fsanitize=leak" CACHE INTERNAL "") +set(${PROJECT_NAME}_UBSAN_COMPILER_FLAGS "-g -O1 -fsanitize=undefined ${GKO_UBSAN_LIBRARIES}" CACHE INTERNAL "") +set(${PROJECT_NAME}_UBSAN_LINKER_FLAGS "-fsanitize=undefined ${GKO_UBSAN_LIBRARIES}" CACHE INTERNAL "") + +# We need to wrap all flags with `-Xcomplier` for HIP when using the NVCC backend +function(GKO_XCOMPILER varname varlist) + set(tmp "") + foreach(item IN LISTS varlist) + set(tmp "${tmp} -Xcompiler \\\\\\\"${item}\\\\\\\"") + endforeach() + set(${varname} "${tmp}" CACHE INTERNAL "") +endfunction() + +GKO_XCOMPILER(${PROJECT_NAME}_NVCC_COVERAGE_COMPILER_FLAGS "-g;-O0;--coverage") +GKO_XCOMPILER(${PROJECT_NAME}_NVCC_COVERAGE_LINKER_FLAGS "--coverage") +GKO_XCOMPILER(${PROJECT_NAME}_NVCC_TSAN_COMPILER_FLAGS "-g;-O1;-fsanitize=thread;-fno-omit-frame-pointer;-fPIC") +GKO_XCOMPILER(${PROJECT_NAME}_NVCC_TSAN_LINKER_FLAGS "-fsanitize=thread;-static-libtsan;-fno-omit-frame-pointer;-fPIC") +GKO_XCOMPILER(${PROJECT_NAME}_NVCC_ASAN_COMPILER_FLAGS "-g;-O1;-fsanitize=address;-fno-omit-frame-pointer") +GKO_XCOMPILER(${PROJECT_NAME}_NVCC_ASAN_LINKER_FLAGS "-fsanitize=address;-fno-omit-frame-pointer") +GKO_XCOMPILER(${PROJECT_NAME}_NVCC_LSAN_COMPILER_FLAGS "-g;-O1;-fsanitize=leak") +GKO_XCOMPILER(${PROJECT_NAME}_NVCC_LSAN_LINKER_FLAGS "-fsanitize=leak") +GKO_XCOMPILER(${PROJECT_NAME}_NVCC_UBSAN_COMPILER_FLAGS "-g;-O1;-fsanitize=undefined;-static-libubsan") +GKO_XCOMPILER(${PROJECT_NAME}_NVCC_UBSAN_LINKER_FLAGS "-fsanitize=undefined;-static-libubsan") + get_property(ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) -foreach(_LANG IN LISTS 
ENABLED_LANGUAGES) +foreach(_LANG IN LISTS ENABLED_LANGUAGES ITEMS "HIP") include(Check${_LANG}CompilerFlag OPTIONAL) foreach(_TYPE IN LISTS ${PROJECT_NAME}_CUSTOM_BUILD_TYPES) # Required for check__compiler_flag. Caution, this can break several @@ -49,7 +83,7 @@ foreach(_LANG IN LISTS ENABLED_LANGUAGES) if(_LANG STREQUAL "C") check_c_compiler_flag("${${PROJECT_NAME}_${_TYPE}_LINKER_FLAGS}" ${PROJECT_NAME}_${_LANG}_${_TYPE}_SUPPORTED) - elseif(_LANG STREQUAL "CXX") + elseif(_LANG STREQUAL "CXX" OR _LANG STREQUAL "HIP") check_cxx_compiler_flag("${${PROJECT_NAME}_${_TYPE}_LINKER_FLAGS}" ${PROJECT_NAME}_${_LANG}_${_TYPE}_SUPPORTED) else() @@ -60,13 +94,23 @@ foreach(_LANG IN LISTS ENABLED_LANGUAGES) continue() endif() if(${PROJECT_NAME}_${_LANG}_${_TYPE}_SUPPORTED) - set(CMAKE_${_LANG}_FLAGS_${_TYPE} - ${${PROJECT_NAME}_${_TYPE}_COMPILER_FLAGS} - CACHE STRING "Flags used by the ${_LANG} compiler during ${_TYPE} builds." FORCE - ) - mark_as_advanced(CMAKE_${_LANG}_FLAGS_${_TYPE}) - set(${PROJECT_NAME}_${_TYPE}_SUPPORTED TRUE CACHE - STRING "Whether or not coverage is supported by at least one compiler." FORCE) + if(_LANG STREQUAL "HIP" AND GINKGO_HIP_PLATFORM STREQUAL "nvcc") + set(CMAKE_${_LANG}_FLAGS_${_TYPE} + ${${PROJECT_NAME}_NVCC_${_TYPE}_COMPILER_FLAGS} + CACHE STRING "Flags used by the ${_LANG} compiler during ${_TYPE} builds." FORCE + ) + mark_as_advanced(CMAKE_${_LANG}_FLAGS_${_TYPE}) + set(${PROJECT_NAME}_${_TYPE}_SUPPORTED TRUE CACHE + STRING "Whether or not coverage is supported by at least one compiler." FORCE) + else() + set(CMAKE_${_LANG}_FLAGS_${_TYPE} + ${${PROJECT_NAME}_${_TYPE}_COMPILER_FLAGS} + CACHE STRING "Flags used by the ${_LANG} compiler during ${_TYPE} builds." FORCE + ) + mark_as_advanced(CMAKE_${_LANG}_FLAGS_${_TYPE}) + set(${PROJECT_NAME}_${_TYPE}_SUPPORTED TRUE CACHE + STRING "Whether or not coverage is supported by at least one compiler." 
FORCE) + endif() endif() set(CMAKE_REQUIRED_LIBRARIES ${_CMAKE_REQUIRED_LIBRARIES}) endforeach() @@ -74,21 +118,6 @@ endforeach() foreach(_TYPE IN LISTS ${PROJECT_NAME}_CUSTOM_BUILD_TYPES) - if(${PROJECT_NAME}_${_TYPE}_SUPPORTED) - set(CMAKE_EXE_LINKER_FLAGS_${_TYPE} - "${${PROJECT_NAME}_${_TYPE}_LINKER_FLAGS}" - CACHE STRING "Flags used for linking binaries during ${_TYPE} builds." FORCE - ) - set(CMAKE_SHARED_LINKER_FLAGS_${_TYPE} - "${${PROJECT_NAME}_${_TYPE}_LINKER_FLAGS}" - CACHE STRING "Flags used by the shared libraries linker during ${_TYPE} builds." FORCE - ) - mark_as_advanced( - CMAKE_EXE_LINKER_FLAGS_${_TYPE} - CMAKE_SHARED_LINKER_FLAGS_${_TYPE} - ) - endif() - cmake_dependent_option(${PROJECT_NAME}_${_TYPE}_IN_CONFIGURATION_TYPES "Should the ${_TYPE} target be in the CMAKE_CONFIGURATION_TYPES list if supported ?" ON # No need for this option if we are not using a multi-config generator diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index f6e2e165437..ca639fe5278 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -1,29 +1,158 @@ function(ginkgo_create_test test_name) file(RELATIVE_PATH REL_BINARY_DIR - ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) + ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}") add_executable(${TEST_TARGET_NAME} ${test_name}.cpp) target_include_directories("${TEST_TARGET_NAME}" PRIVATE - "$" + "$" ) set_target_properties(${TEST_TARGET_NAME} PROPERTIES OUTPUT_NAME ${test_name}) - target_link_libraries(${TEST_TARGET_NAME} PRIVATE ginkgo GTest::GTest GTest::Main ${ARGN}) + if (GINKGO_CHECK_CIRCULAR_DEPS) + target_link_libraries(${TEST_TARGET_NAME} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}") + endif() + target_link_libraries(${TEST_TARGET_NAME} PRIVATE ginkgo GTest::Main GTest::GTest ${ARGN}) add_test(NAME ${REL_BINARY_DIR}/${test_name} COMMAND ${TEST_TARGET_NAME}) endfunction(ginkgo_create_test) 
+function(ginkgo_create_thread_test test_name) + set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads REQUIRED) + file(RELATIVE_PATH REL_BINARY_DIR + ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) + string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}") + add_executable(${TEST_TARGET_NAME} ${test_name}.cpp) + target_include_directories("${TEST_TARGET_NAME}" + PRIVATE + "$" + ) + set_target_properties(${TEST_TARGET_NAME} PROPERTIES + OUTPUT_NAME ${test_name}) + if (GINKGO_CHECK_CIRCULAR_DEPS) + target_link_libraries(${TEST_TARGET_NAME} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}") + endif() + target_link_libraries(${TEST_TARGET_NAME} PRIVATE ginkgo GTest::Main GTest::GTest Threads::Threads ${ARGN}) + add_test(NAME ${REL_BINARY_DIR}/${test_name} COMMAND ${TEST_TARGET_NAME}) +endfunction(ginkgo_create_thread_test) + +function(ginkgo_create_test_cpp_cuda_header test_name) + file(RELATIVE_PATH REL_BINARY_DIR + ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) + string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}") + add_executable(${TEST_TARGET_NAME} ${test_name}.cpp) + target_include_directories("${TEST_TARGET_NAME}" + PRIVATE + "$" + "${CUDA_INCLUDE_DIRS}" + ) + set_target_properties(${TEST_TARGET_NAME} PROPERTIES + OUTPUT_NAME ${test_name}) + if (GINKGO_CHECK_CIRCULAR_DEPS) + target_link_libraries(${TEST_TARGET_NAME} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}") + endif() + target_link_libraries(${TEST_TARGET_NAME} PRIVATE ginkgo GTest::Main GTest::GTest ${ARGN}) + add_test(NAME ${REL_BINARY_DIR}/${test_name} COMMAND ${TEST_TARGET_NAME}) +endfunction(ginkgo_create_test_cpp_cuda_header) + function(ginkgo_create_cuda_test test_name) file(RELATIVE_PATH REL_BINARY_DIR - ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) + ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}") add_executable(${TEST_TARGET_NAME} ${test_name}.cu) 
target_include_directories("${TEST_TARGET_NAME}" PRIVATE - "$" + "$" ) set_target_properties(${TEST_TARGET_NAME} PROPERTIES OUTPUT_NAME ${test_name}) - target_link_libraries(${TEST_TARGET_NAME} PRIVATE ginkgo GTest::GTest GTest::Main ${ARGN}) + + if (GINKGO_CHECK_CIRCULAR_DEPS) + target_link_libraries(${TEST_TARGET_NAME} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}") + endif() + target_link_libraries(${TEST_TARGET_NAME} PRIVATE ginkgo GTest::Main GTest::GTest ${ARGN}) add_test(NAME ${REL_BINARY_DIR}/${test_name} COMMAND ${TEST_TARGET_NAME}) endfunction(ginkgo_create_cuda_test) + +function(ginkgo_create_hip_test_special_linkage test_name) + # use gcc to compile but use hip to link + file(RELATIVE_PATH REL_BINARY_DIR + ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) + string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}") + add_executable(${TEST_TARGET_NAME} ${test_name}.cpp) + # Fix the missing metadata when building static library. + if(GINKGO_HIP_PLATFORM MATCHES "hcc" AND NOT BUILD_SHARED_LIBS) + set_target_properties(${TEST_TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP) + endif() + target_include_directories("${TEST_TARGET_NAME}" + PRIVATE + "$" + ) + set_target_properties(${TEST_TARGET_NAME} PROPERTIES + OUTPUT_NAME ${test_name}) + if (GINKGO_CHECK_CIRCULAR_DEPS) + target_link_libraries(${TEST_TARGET_NAME} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}") + endif() + target_link_libraries(${TEST_TARGET_NAME} PRIVATE ginkgo GTest::Main GTest::GTest ${ARGN}) + add_test(NAME ${REL_BINARY_DIR}/${test_name} COMMAND ${TEST_TARGET_NAME}) +endfunction(ginkgo_create_hip_test_special_linkage) + +function(ginkgo_create_hip_test test_name) + file(RELATIVE_PATH REL_BINARY_DIR + ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) + string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}") + + set_source_files_properties(${test_name}.hip.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT TRUE) + + if (HIP_VERSION GREATER_EQUAL "3.5") + 
hip_add_executable(${TEST_TARGET_NAME} ${test_name}.hip.cpp + HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS} + NVCC_OPTIONS ${GINKGO_HIP_NVCC_OPTIONS} + HCC_OPTIONS ${GINKGO_HIP_HCC_OPTIONS} + CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS}) + else() + hip_add_executable(${TEST_TARGET_NAME} ${test_name}.hip.cpp + HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS} + NVCC_OPTIONS ${GINKGO_HIP_NVCC_OPTIONS} + HCC_OPTIONS ${GINKGO_HIP_HCC_OPTIONS}) + endif() + + # Let's really not use nvcc for linking here + if (GINKGO_HIP_PLATFORM MATCHES "nvcc") + set_target_properties(${TEST_TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX) + endif() + + target_include_directories("${TEST_TARGET_NAME}" + PRIVATE + "$" + # Only `math` requires it so far, but it's much easier + # to put these this way. + ${GINKGO_HIP_THRUST_PATH} + # Only `exception_helpers` requires thess so far, but it's much easier + # to put these this way. + ${HIPBLAS_INCLUDE_DIRS} + ${HIPSPARSE_INCLUDE_DIRS} + ) + set_target_properties(${TEST_TARGET_NAME} PROPERTIES + OUTPUT_NAME ${test_name}) + + # Pass in the `--amdgpu-target` flags if asked + if(GINKGO_HIP_AMDGPU AND GINKGO_HIP_PLATFORM MATCHES "hcc") + foreach(target ${GINKGO_HIP_AMDGPU}) + target_link_libraries(${TEST_TARGET_NAME} PRIVATE --amdgpu-target=${target}) + endforeach() + endif() + + # GINKGO_RPATH_FOR_HIP needs to be populated before calling this for the linker to include + # our libraries path into the executable's runpath. 
+ if(BUILD_SHARED_LIBS) + target_link_libraries(${TEST_TARGET_NAME} PRIVATE "${GINKGO_RPATH_FOR_HIP}") + + if (GINKGO_CHECK_CIRCULAR_DEPS) + target_link_libraries(${TEST_TARGET_NAME} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}") + endif() + endif() + + target_link_libraries(${TEST_TARGET_NAME} PRIVATE ginkgo GTest::Main GTest::GTest ${ARGN}) + add_test(NAME ${REL_BINARY_DIR}/${test_name} COMMAND ${TEST_TARGET_NAME}) +endfunction(ginkgo_create_hip_test) diff --git a/cmake/hip_helpers.cmake b/cmake/hip_helpers.cmake new file mode 100644 index 00000000000..c296ffc1228 --- /dev/null +++ b/cmake/hip_helpers.cmake @@ -0,0 +1,30 @@ +macro(ginkgo_hip_ban_link_hcflag target) + if(TARGET ${target}) + get_target_property(GINKGO_TARGET_ILL ${target} INTERFACE_LINK_LIBRARIES) + string(REPLACE "-hc " "" GINKGO_TARGET_NEW_ILL "${GINKGO_TARGET_ILL}") + set_target_properties(${target} PROPERTIES INTERFACE_LINK_LIBRARIES "${GINKGO_TARGET_NEW_ILL}") + endif() +endmacro() + +macro(ginkgo_hip_ban_compile_hcflag target) + if(TARGET ${target}) + get_target_property(GINKGO_TARGET_ILL ${target} INTERFACE_COMPILE_OPTIONS) + string(REPLACE "-hc" "" GINKGO_TARGET_NEW_ILL "${GINKGO_TARGET_ILL}") + set_target_properties(${target} PROPERTIES INTERFACE_COMPILE_OPTIONS "${GINKGO_TARGET_NEW_ILL}") + endif() +endmacro() + +macro(ginkgo_hip_clang_ban_hip_device_flags) + if (GINKGO_HIP_VERSION VERSION_GREATER_EQUAL "3.5") + # Compile options somehow add hip-clang specific flags. Wipe them. + # Currently, the flags wiped out should be: + # -x;hip;--hip-device-lib-path=/opt/rocm/lib;--cuda-gpu-arch=gfx900; + # --cuda-gpu-arch=gfx906 + set_target_properties(hip::device PROPERTIES INTERFACE_COMPILE_OPTIONS "") + # In addition, link libraries have a similar problem. We only keep + # `hip::host`. 
Currently, the flags should be: + # hip::host;--hip-device-lib-path=/opt/rocm/lib;--hip-link; + # --cuda-gpu-arch=gfx900;--cuda-gpu-arch=gfx906 + set_target_properties(hip::device PROPERTIES INTERFACE_LINK_LIBRARIES "hip::host") + endif() +endmacro() diff --git a/cmake/hip_path.cmake b/cmake/hip_path.cmake new file mode 100644 index 00000000000..aa0e116527b --- /dev/null +++ b/cmake/hip_path.cmake @@ -0,0 +1,13 @@ +if(NOT DEFINED HIP_PATH) + if(NOT DEFINED ENV{HIP_PATH}) + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") + set(ENV{HIP_PATH} ${HIP_PATH}) + else() + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + endif() +endif() + +find_program(GINKGO_HIPCONFIG_PATH hipconfig HINTS "${HIP_PATH}/bin") +if(GINKGO_HIPCONFIG_PATH) + message(STATUS "Found hipconfig: ${GINKGO_HIPCONFIG_PATH}") +endif() \ No newline at end of file diff --git a/cmake/information_helpers.cmake b/cmake/information_helpers.cmake index 13b3f85061d..e128fb5869a 100644 --- a/cmake/information_helpers.cmake +++ b/cmake/information_helpers.cmake @@ -18,12 +18,12 @@ macro(ginkgo_git_information) OUTPUT_VARIABLE GINKGO_GIT_BRANCH OUTPUT_STRIP_TRAILING_WHITESPACE) execute_process( - COMMAND ${GIT_EXECUTABLE} log -1 --format=%H ${Gingko_SOURCE_DIR} + COMMAND ${GIT_EXECUTABLE} log -1 --format=%H ${Ginkgo_SOURCE_DIR} WORKING_DIRECTORY ${Ginkgo_SOURCE_DIR} OUTPUT_VARIABLE GINKGO_GIT_REVISION OUTPUT_STRIP_TRAILING_WHITESPACE) execute_process( - COMMAND ${GIT_EXECUTABLE} log -1 --format=%h ${Gingko_SOURCE_DIR} + COMMAND ${GIT_EXECUTABLE} log -1 --format=%h ${Ginkgo_SOURCE_DIR} WORKING_DIRECTORY ${Ginkgo_SOURCE_DIR} OUTPUT_VARIABLE GINKGO_GIT_SHORTREV OUTPUT_STRIP_TRAILING_WHITESPACE) diff --git a/cmake/install_helpers.cmake b/cmake/install_helpers.cmake index fd0c90d383f..ba7ea3fd468 100644 --- a/cmake/install_helpers.cmake +++ b/cmake/install_helpers.cmake @@ -9,12 +9,23 @@ set(GINKGO_INSTALL_CONFIG_DIR "lib/cmake/Ginkgo") 
set(GINKGO_INSTALL_MODULE_DIR "lib/cmake/Ginkgo/Modules") function(ginkgo_install_library name subdir) - # install .so and .a files - install(TARGETS "${name}" - EXPORT Ginkgo - LIBRARY DESTINATION ${GINKGO_INSTALL_LIBRARY_DIR} - ARCHIVE DESTINATION ${GINKGO_INSTALL_LIBRARY_DIR} + + if (WIN32 OR CYGWIN) + # dll is considered as runtime + install(TARGETS "${name}" + EXPORT Ginkgo + LIBRARY DESTINATION ${GINKGO_INSTALL_LIBRARY_DIR} + ARCHIVE DESTINATION ${GINKGO_INSTALL_LIBRARY_DIR} + RUNTIME DESTINATION ${GINKGO_INSTALL_LIBRARY_DIR} + ) + else () + # install .so and .a files + install(TARGETS "${name}" + EXPORT Ginkgo + LIBRARY DESTINATION ${GINKGO_INSTALL_LIBRARY_DIR} + ARCHIVE DESTINATION ${GINKGO_INSTALL_LIBRARY_DIR} ) + endif () endfunction() function(ginkgo_install) @@ -59,9 +70,14 @@ function(ginkgo_install) install(FILES "${Ginkgo_BINARY_DIR}/GinkgoConfig.cmake" "${Ginkgo_BINARY_DIR}/GinkgoConfigVersion.cmake" - "${Ginkgo_BINARY_DIR}/GinkgoTargets.cmake" + "${Ginkgo_SOURCE_DIR}/cmake/hip_helpers.cmake" + "${Ginkgo_SOURCE_DIR}/cmake/windows_helpers.cmake" DESTINATION "${GINKGO_INSTALL_CONFIG_DIR}" ) + install(EXPORT Ginkgo + NAMESPACE Ginkgo:: + FILE GinkgoTargets.cmake + DESTINATION "${GINKGO_INSTALL_CONFIG_DIR}") # Export package for use from the build tree if (GINKGO_EXPORT_BUILD_DIR) diff --git a/cmake/windows_helpers.cmake b/cmake/windows_helpers.cmake new file mode 100644 index 00000000000..5f517a555ad --- /dev/null +++ b/cmake/windows_helpers.cmake @@ -0,0 +1,22 @@ +function(ginkgo_switch_windows_link lang from to) + foreach(flag_var + "CMAKE_${lang}_FLAGS" "CMAKE_${lang}_FLAGS_DEBUG" "CMAKE_${lang}_FLAGS_RELEASE" + "CMAKE_${lang}_FLAGS_MINSIZEREL" "CMAKE_${lang}_FLAGS_RELWITHDEBINFO" + ) + if(${flag_var} MATCHES "/${from}") + string(REGEX REPLACE "/${from}" "/${to}" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/${from}") + if(${flag_var} MATCHES "-${from}") + string(REGEX REPLACE "-${from}" "-${to}" ${flag_var} "${${flag_var}}") + 
endif(${flag_var} MATCHES "-${from}") + set(${flag_var} "${${flag_var}}" CACHE STRING "" FORCE) + endforeach() +endfunction() + +macro(ginkgo_switch_to_windows_static lang) + ginkgo_switch_windows_link(${lang} "MD" "MT") +endmacro() + +macro(ginkgo_switch_to_windows_dynamic lang) + ginkgo_switch_windows_link(${lang} "MT" "MD") +endmacro() diff --git a/codecov.yml b/codecov.yml index a065fc381f9..bdb86838644 100644 --- a/codecov.yml +++ b/codecov.yml @@ -5,10 +5,12 @@ coverage: target: auto threshold: 5 base: auto + informational: True project: default: target: auto threshold: 2 base: auto ignore: - - "**/test/" + - "examples" + - "benchmark" diff --git a/common/base/executor.hpp.inc b/common/base/executor.hpp.inc new file mode 100644 index 00000000000..5b85069c4d8 --- /dev/null +++ b/common/base/executor.hpp.inc @@ -0,0 +1,85 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +namespace { + + +// The function is copied from _ConvertSMVer2Cores of +// cuda-9.2/samples/common/inc/helper_cuda.h +inline int convert_sm_ver_to_cores(int major, int minor) +{ + // Defines for GPU Architecture types (using the SM version to determine + // the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = { + {0x30, 192}, // Kepler Generation (SM 3.0) GK10x class + {0x32, 192}, // Kepler Generation (SM 3.2) GK10x class + {0x35, 192}, // Kepler Generation (SM 3.5) GK11x class + {0x37, 192}, // Kepler Generation (SM 3.7) GK21x class + {0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class + {0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class + {0x53, 128}, // Maxwell Generation (SM 5.3) GM20x class + {0x60, 64}, // Pascal Generation (SM 6.0) GP100 class + {0x61, 128}, // Pascal Generation (SM 6.1) GP10x class + {0x62, 128}, // Pascal Generation (SM 6.2) GP10x class + {0x70, 64}, // Volta Generation (SM 7.0) GV100 class + {0x72, 64}, // Volta Generation (SM 7.2) GV11b class + {0x75, 64}, // Turing Generation (SM 7.5) TU1xx class + {-1, -1}}; + + int index = 0; + + while (nGpuArchCoresPerSM[index].SM != -1) { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { + return nGpuArchCoresPerSM[index].Cores; + } + index++; + } + 
+#if GKO_VERBOSE_LEVEL >= 1 + // If we don't find the values, we use the last valid value by default + // to allow proper execution + std::cerr << "MapSMtoCores for SM " << major << "." << minor + << "is undefined. The default value of " + << nGpuArchCoresPerSM[index - 1].Cores << " Cores/SM is used." + << std::endl; +#endif + return nGpuArchCoresPerSM[index - 1].Cores; +} + + +} // namespace \ No newline at end of file diff --git a/common/base/math.hpp.inc b/common/base/math.hpp.inc new file mode 100644 index 00000000000..3ba49b585c3 --- /dev/null +++ b/common/base/math.hpp.inc @@ -0,0 +1,63 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +// We need this struct, because otherwise we would call a __host__ function in a +// __device__ function (even though it is constexpr) +template +struct device_numeric_limits { + static constexpr auto inf = std::numeric_limits::infinity(); + static constexpr auto max = std::numeric_limits::max(); + static constexpr auto min = std::numeric_limits::min(); +}; + + +namespace detail { + + +template +struct remove_complex_impl> { + using type = T; +}; + + +template +struct is_complex_impl> + : public std::integral_constant {}; + + +template +struct truncate_type_impl> { + using type = thrust::complex::type>; +}; + + +} // namespace detail \ No newline at end of file diff --git a/common/components/atomic.hpp.inc b/common/components/atomic.hpp.inc new file mode 100644 index 00000000000..e36306e04d9 --- /dev/null +++ b/common/components/atomic.hpp.inc @@ -0,0 +1,152 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + + +namespace detail { + + +template +struct atomic_helper { + __forceinline__ __device__ static ValueType atomic_add(ValueType *, + ValueType) + { + static_assert(sizeof(ValueType) == 0, + "This default function is not implemented, only the " + "specializations are."); + // TODO: add proper implementation of generic atomic add + } +}; + + +template +__forceinline__ __device__ ResultType reinterpret(ValueType val) +{ + static_assert(sizeof(ValueType) == sizeof(ResultType), + "The type to reinterpret to must be of the same size as the " + "original type."); + return reinterpret_cast(val); +} + + +#define GKO_BIND_ATOMIC_HELPER_STRUCTURE(CONVERTER_TYPE) \ + template \ + struct atomic_helper> { \ + __forceinline__ __device__ static ValueType atomic_add( \ + ValueType *__restrict__ addr, ValueType val) \ + { \ + CONVERTER_TYPE *address_as_converter = \ + reinterpret_cast(addr); \ + CONVERTER_TYPE old = *address_as_converter; \ + CONVERTER_TYPE assumed; \ + do { \ + assumed = old; \ + old = atomicCAS(address_as_converter, assumed, \ + reinterpret( \ + val + reinterpret(assumed))); \ + } while (assumed != old); \ + return reinterpret(old); \ + } \ + }; + +// Support 64-bit ATOMIC_ADD +GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int); +// Support 32-bit ATOMIC_ADD +GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int); + + +#if !(defined(CUDA_VERSION) && (CUDA_VERSION < 10010)) +// CUDA 10.1 starts supporting 16-bit unsigned short int atomicCAS +GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int); +#endif + +#undef GKO_BIND_ATOMIC_HELPER_STRUCTURE + + +} // namespace detail + + +template +__forceinline__ __device__ T atomic_add(T *__restrict__ addr, T val) +{ + return detail::atomic_helper::atomic_add(addr, val); +} + + +#define GKO_BIND_ATOMIC_ADD(ValueType) \ + __forceinline__ __device__ ValueType atomic_add( \ + ValueType *__restrict__ addr, ValueType val) \ + { \ + return atomicAdd(addr, val); \ + } + 
+GKO_BIND_ATOMIC_ADD(int); +GKO_BIND_ATOMIC_ADD(unsigned int); +GKO_BIND_ATOMIC_ADD(unsigned long long int); +GKO_BIND_ATOMIC_ADD(float); + + +#if !defined(__HIPCC__) || \ + (defined(__HIP_DEVICE_COMPILE__) && GINKGO_HIP_PLATFORM_NVCC) + + +#if !((defined(CUDA_VERSION) && (CUDA_VERSION < 8000)) || \ + (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600))) +// CUDA 8.0 starts supporting 64-bit double atomicAdd on devices of compute +// capability 6.x and higher +GKO_BIND_ATOMIC_ADD(double); +#endif + +#if !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || \ + (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) +// CUDA 10.0 starts supporting 16-bit __half floating-point atomicAdd on devices +// of compute capability 7.x and higher. +GKO_BIND_ATOMIC_ADD(__half); +#endif + +#if !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || \ + (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600))) +// CUDA 10.0 starts supporting 32-bit __half2 floating-point atomicAdd on +// devices of compute capability 6.x and higher. note: The atomicity of the +// __half2 add operation is guaranteed separately for each of the two __half +// elements; the entire __half2 is not guaranteed to be atomic as a single +// 32-bit access. +GKO_BIND_ATOMIC_ADD(__half2); +#endif + + +#endif // !defined(__HIPCC__) || (defined(__HIP_DEVICE_COMPILE__) && + // GINKGO_HIP_PLATFORM_NVCC) + + +#undef GKO_BIND_ATOMIC_ADD \ No newline at end of file diff --git a/common/components/diagonal_block_manipulation.hpp.inc b/common/components/diagonal_block_manipulation.hpp.inc new file mode 100644 index 00000000000..2270bc293d7 --- /dev/null +++ b/common/components/diagonal_block_manipulation.hpp.inc @@ -0,0 +1,93 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1.
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +/** + * @internal + * + * @note assumes that block dimensions are in "standard format": + * (subwarp_size, config::warp_size / subwarp_size, z) + */ +template < + int max_block_size, int warps_per_block, typename Group, typename ValueType, + typename IndexType, + typename = xstd::enable_if_t::value>> +__device__ __forceinline__ void extract_transposed_diag_blocks( + const Group &group, int processed_blocks, + const IndexType *__restrict__ row_ptrs, + const IndexType *__restrict__ col_idxs, + const ValueType *__restrict__ values, + const IndexType *__restrict__ block_ptrs, size_type num_blocks, + ValueType *__restrict__ block_row, int increment, + ValueType *__restrict__ workspace) +{ + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + const auto warp = group::tiled_partition(group); + auto bid = static_cast(blockIdx.x) * warps_per_block * + processed_blocks + + threadIdx.z * processed_blocks; + auto bstart = (bid < num_blocks) ? 
block_ptrs[bid] : zero(); + IndexType bsize = 0; +#pragma unroll + for (int b = 0; b < processed_blocks; ++b, ++bid) { + if (bid < num_blocks) { + bstart += bsize; + bsize = block_ptrs[bid + 1] - bstart; +#pragma unroll + for (int i = 0; i < max_block_size; ++i) { + if (i < bsize) { + if (threadIdx.y == b && threadIdx.x < max_block_size) { + workspace[threadIdx.x] = zero(); + } + warp.sync(); + const auto row = bstart + i; + const auto rstart = row_ptrs[row] + tid; + const auto rend = row_ptrs[row + 1]; + // use the entire warp to ensure coalesced memory access + for (auto j = rstart; j < rend; j += config::warp_size) { + const auto col = col_idxs[j] - bstart; + if (col >= bsize) { + break; + } + if (col >= 0) { + workspace[col] = values[j]; + } + } + warp.sync(); + if (threadIdx.y == b && threadIdx.x < bsize) { + block_row[i * increment] = workspace[threadIdx.x]; + } + warp.sync(); + } + } + } + } +} \ No newline at end of file diff --git a/common/components/fill_array.hpp.inc b/common/components/fill_array.hpp.inc new file mode 100644 index 00000000000..04e6fe67b79 --- /dev/null +++ b/common/components/fill_array.hpp.inc @@ -0,0 +1,48 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + + +namespace kernel { + + +template +__global__ __launch_bounds__(default_block_size) void fill_array( + size_type n, ValueType *__restrict__ array, ValueType val) +{ + const auto tidx = thread::get_thread_id_flat(); + if (tidx < n) { + array[tidx] = val; + } +} + + +} // namespace kernel diff --git a/common/components/intrinsics.hpp.inc b/common/components/intrinsics.hpp.inc new file mode 100644 index 00000000000..f89fa434eb4 --- /dev/null +++ b/common/components/intrinsics.hpp.inc @@ -0,0 +1,66 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +/** + * @internal + * Returns the number of set bits in the given mask. + */ +__forceinline__ __device__ int popcnt(uint32 mask) { return __popc(mask); } + +/** @copydoc popcnt */ +__forceinline__ __device__ int popcnt(uint64 mask) { return __popcll(mask); } + + +/** + * @internal + * Returns the (1-based!) index of the first set bit in the given mask, + * starting from the least significant bit. + */ +__forceinline__ __device__ int ffs(uint32 mask) { return __ffs(mask); } + +/** @copydoc ffs */ +__forceinline__ __device__ int ffs(uint64 mask) +{ + // the cast is necessary, as the overloads defined by HIP are ambiguous + return __ffsll(static_cast(mask)); +} + + +/** + * @internal + * Returns the number of zero bits before the first set bit in the given mask, + * starting from the most significant bit. 
+ */ +__forceinline__ __device__ int clz(uint32 mask) { return __clz(mask); } + +/** @copydoc clz */ +__forceinline__ __device__ int clz(uint64 mask) { return __clzll(mask); } diff --git a/common/components/merging.hpp.inc b/common/components/merging.hpp.inc new file mode 100644 index 00000000000..c91f76e1fd4 --- /dev/null +++ b/common/components/merging.hpp.inc @@ -0,0 +1,310 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +namespace detail { + + +/** + * @internal + * The result from the @ref group_merge_step function. + */ +template +struct merge_result { + /** The element of a being merged in the current thread. */ + ValueType a_val; + /** The element of b being merged in the current thread. */ + ValueType b_val; + /** The index from a that is being merged in the current thread. */ + int a_idx; + /** The index from b that is being merged in the current thread. */ + int b_idx; + /** The number of elements from a that have been merged in total. */ + int a_advance; + /** The number of elements from b that have been merged in total. */ + int b_advance; +}; + +} // namespace detail + + +/** + * @internal + * Warp-parallel merge algorithm that merges the first `warp_size` elements from + * two ranges, where each thread stores a single element from each range. + * It assumes that the elements are sorted in ascending order, i.e. for i < j, + * the value of `a` at thread i is smaller or equal to the value at thread j, + * and the same holds for `b`. + * + * This implementation is based on ideas from Green et al., + * "GPU merge path: a GPU merging algorithm", but uses random-access warp + * shuffles instead of shared-memory to exchange values of a and b. + * + * @param a the element from the first range + * @param b the element from the second range + * @param size the number of elements in the output range + * @param group the cooperative group that executes the merge + * @return a structure containing the merge result distributed over the group.
+ */ +template +__forceinline__ __device__ detail::merge_result group_merge_step( + ValueType a, ValueType b, Group group) +{ + // thread i takes care of ith element of the merged sequence + auto i = int(group.thread_rank()); + + // we want to find the smallest index `x` such that a[x] >= b[i - x - 1] + // or `i` if no such index exists + // + // if x = i then c[0...i - 1] = a[0...i - 1] + // => merge a[i] with b[0] + // if x = 0 then c[0...i - 1] = b[0...i - 1] + // => merge a[0] with b[i] + // otherwise c[0...i - 1] contains a[0...x - 1] and b[0...i - x - 1] + // because the minimality of `x` implies + // b[i - x] >= a[x - 1] + // and a[x] >= a[0...x - 1], b[0...i - x - 1] + // => merge a[x] with b[i - x] + auto minx = synchronous_fixed_binary_search([&](int x) { + auto a_remote = group.shfl(a, x); + auto b_remote = group.shfl(b, max(i - x - 1, 0)); + return a_remote >= b_remote || x >= i; + }); + + auto a_idx = minx; + auto b_idx = max(i - minx, 0); + auto a_val = group.shfl(a, a_idx); + auto b_val = group.shfl(b, b_idx); + auto cmp = a_val < b_val; + auto a_advance = popcnt(group.ballot(cmp)); + auto b_advance = int(group.size()) - a_advance; + + return {a_val, b_val, a_idx, b_idx, a_advance, b_advance}; +} + + +/** + * @internal + * Warp-parallel merge algorithm that merges two sorted ranges of arbitrary + * size. `merge_fn` will be called for each merged element. + * + * @param a the first range + * @param a_size the size of the first range + * @param b the second range + * @param b_size the size of the second range + * @param group the group that executes the merge + * @param merge_fn the callback that is being called for each merged element. + * It takes six parameters: + * `IndexType a_idx, ValueType a_val, IndexType b_idx, + * ValueType b_val, IndexType c_index, bool valid`. + * `*_val` and `*_idx` are the values resp. the indices of the + * values from a/b being compared at output index `c_index`. 
+ * `valid` specifies if the current thread has to merge an + * element (this is necessary for shfl and ballot operations). + * It must return `false` on all threads of the group iff the + * merge shouldn't be continued. + */ +template +__forceinline__ __device__ void group_merge(const ValueType *__restrict__ a, + IndexType a_size, + const ValueType *__restrict__ b, + IndexType b_size, Group group, + Callback merge_fn) +{ + auto c_size = a_size + b_size; + IndexType a_begin{}; + IndexType b_begin{}; + auto lane = static_cast(group.thread_rank()); + auto sentinel = device_numeric_limits::max; + auto a_cur = checked_load(a, a_begin + lane, a_size, sentinel); + auto b_cur = checked_load(b, b_begin + lane, b_size, sentinel); + for (IndexType c_begin{}; c_begin < c_size; c_begin += group_size) { + auto merge_result = group_merge_step(a_cur, b_cur, group); + auto valid = c_begin + lane < c_size; + auto cont = merge_fn(merge_result.a_idx + a_begin, merge_result.a_val, + merge_result.b_idx + b_begin, merge_result.b_val, + c_begin + lane, valid); + if (!group.any(cont && valid)) { + break; + } + auto a_advance = merge_result.a_advance; + auto b_advance = merge_result.b_advance; + a_begin += a_advance; + b_begin += b_advance; + + // shuffle the unmerged elements to the front + a_cur = group.shfl_down(a_cur, a_advance); + b_cur = group.shfl_down(b_cur, b_advance); + /* + * To optimize memory access, we load the new elements for `a` and `b` + * with a single load instruction: + * the lower part of the group loads new elements for `a` + * the upper part of the group loads new elements for `b` + * `load_lane` is the part-local lane idx + * The elements for `a` have to be shuffled up afterwards. + */ + auto load_a = lane < a_advance; + auto load_lane = load_a ? lane : lane - a_advance; + auto load_source = load_a ? a : b; + auto load_begin = load_a ? a_begin + b_advance : b_begin + a_advance; + auto load_size = load_a ? 
a_size : b_size;
+
+        auto load_idx = load_begin + load_lane;
+        auto loaded = checked_load(load_source, load_idx, load_size, sentinel);
+        // shuffle the `a` values to the end of the warp
+        auto lower_loaded = group.shfl_up(loaded, b_advance);
+        a_cur = lane < b_advance ? a_cur : lower_loaded;
+        b_cur = lane < a_advance ? b_cur : loaded;
+    }
+}
+
+
+/**
+ * @internal
+ * Warp-parallel merge algorithm that reports matching elements from two sorted
+ * ranges of arbitrary size. `match_fn` will be called for each pair of matching
+ * elements.
+ * Note that `match_fn` is invoked by every lane in every merge step; lanes
+ * that did not find a match receive `valid == false`.
+ *
+ * @param a  the first range
+ * @param a_size  the size of the first range
+ * @param b  the second range
+ * @param b_size  the size of the second range
+ * @param group  the group that executes the merge
+ * @param match_fn  the callback that is being called for each matching pair.
+ *                  It takes five parameters:
+ *                  `ValueType val, IndexType a_idx, IndexType b_idx,
+ *                  lane_mask_type match_mask, bool valid`.
+ *                  `val` is the matching element, `*_idx` are the indices of
+ *                  the matching values from a and b, match_mask is a lane mask
+ *                  that is 1 for every subwarp lane that found a match.
+ *                  `valid` is true iff there is actually a match.
+ *                  (necessary for warp-synchronous operations)
+ */
+template
+__forceinline__ __device__ void group_match(const ValueType *__restrict__ a,
+                                            IndexType a_size,
+                                            const ValueType *__restrict__ b,
+                                            IndexType b_size, Group group,
+                                            Callback match_fn)
+{
+    group_merge(
+        a, a_size, b, b_size, group,
+        [&](IndexType a_idx, ValueType a_val, IndexType b_idx, ValueType b_val,
+            IndexType, bool valid) {
+            auto matchmask = group.ballot(a_val == b_val && valid);
+            match_fn(a_val, a_idx, b_idx, matchmask, a_val == b_val && valid);
+            // keep merging only while both compared indices are in range
+            return a_idx < a_size && b_idx < b_size;
+        });
+}
+
+
+/**
+ * @internal
+ * Sequential merge algorithm that merges two sorted ranges of arbitrary
+ * size. `merge_fn` will be called for each merged element.
+ *
+ * @param a  the first range
+ * @param a_size  the size of the first range
+ * @param b  the second range
+ * @param b_size  the size of the second range
+ * @param merge_fn  the callback that will be called for each merge step.
+ *                  It takes five parameters:
+ *                  `IndexType a_idx, ValueType a_val,
+ *                  IndexType b_idx, ValueType b_val, IndexType c_idx`.
+ *                  `*_val` and `*_idx` are the values resp. the indices of
+ *                  the values from a/b being compared in step `c_idx`.
+ *                  It must return `false` iff the merge should stop.
+ */
+template
+__forceinline__ __device__ void sequential_merge(
+    const ValueType *__restrict__ a, IndexType a_size,
+    const ValueType *__restrict__ b, IndexType b_size, Callback merge_fn)
+{
+    auto c_size = a_size + b_size;
+    IndexType a_begin{};
+    IndexType b_begin{};
+    auto sentinel = device_numeric_limits::max;
+    auto a_cur = checked_load(a, a_begin, a_size, sentinel);
+    auto b_cur = checked_load(b, b_begin, b_size, sentinel);
+    for (IndexType c_begin{}; c_begin < c_size; c_begin++) {
+        auto cont = merge_fn(a_begin, a_cur, b_begin, b_cur, c_begin);
+        if (!cont) {
+            break;
+        }
+        // exactly one range advances per step: `a` if its current value is
+        // strictly smaller, otherwise (including ties) `b`
+        auto a_advance = a_cur < b_cur;
+        auto b_advance = !a_advance;
+        a_begin += a_advance;
+        b_begin += b_advance;
+
+        // a single load refills whichever side was advanced
+        auto load = a_advance ? a : b;
+        auto load_size = a_advance ? a_size : b_size;
+        auto load_idx = a_advance ? a_begin : b_begin;
+        auto loaded = checked_load(load, load_idx, load_size, sentinel);
+        a_cur = a_advance ? loaded : a_cur;
+        b_cur = b_advance ? loaded : b_cur;
+    }
+}
+
+
+/**
+ * @internal
+ * Sequential algorithm that finds matching elements in two sorted ranges of
+ * arbitrary size. `match_fn` will be called for each pair of matching
+ * elements.
+ *
+ * @param a  the first range
+ * @param a_size  the size of the first range
+ * @param b  the second range
+ * @param b_size  the size of the second range
+ * @param match_fn  the callback that is being called for each match.
+ * It takes three parameters: + * `ValueType val, IndexType a_idx, IndexType b_idx`. + * `val` is the matching element, `*_idx` are the + * indices of the matching values from a and b. + */ +template +__forceinline__ __device__ void sequential_match(const ValueType *a, + IndexType a_size, + const ValueType *b, + IndexType b_size, + Callback match_fn) +{ + sequential_merge(a, a_size, b, b_size, + [&](IndexType a_idx, ValueType a_val, IndexType b_idx, + ValueType b_val, IndexType) { + if (a_val == b_val) { + match_fn(a_val, a_idx, b_idx); + } + return a_idx < a_size && b_idx < b_size; + }); +} \ No newline at end of file diff --git a/common/components/precision_conversion.hpp.inc b/common/components/precision_conversion.hpp.inc new file mode 100644 index 00000000000..c486354f156 --- /dev/null +++ b/common/components/precision_conversion.hpp.inc @@ -0,0 +1,41 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +template +__global__ void convert_precision(size_type size, const SourceType *in, + TargetType *out) +{ + auto tnum = thread::get_thread_num_flat(); + for (auto i = thread::get_thread_id_flat(); i < size; i += tnum) { + out[i] = in[i]; + } +} \ No newline at end of file diff --git a/common/components/prefix_sum.hpp.inc b/common/components/prefix_sum.hpp.inc new file mode 100644 index 00000000000..9db51a3dc4f --- /dev/null +++ b/common/components/prefix_sum.hpp.inc @@ -0,0 +1,186 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + + +/** + * @internal + * Computes the prefix sum and total sum of `element` over a subwarp. + * + * @param element the element over which we compute the prefix sum. + * @param prefix_sum will be set to the sum of all `element`s from lower + * lanes, plus the local `element` if `inclusive` is `true`. + * @param total_sum will be set to the total sum of `element` in this subwarp. + * @param subwarp the cooperative group representing the subwarp. + * + * @tparam inclusive if this is true, the computed prefix sum will be + * inclusive, otherwise it will be exclusive. + * + * @note For this function to work on architectures with independent thread + * scheduling, all threads of the subwarp have to execute it. + */ +template +__forceinline__ __device__ void subwarp_prefix_sum(ValueType element, + ValueType &prefix_sum, + ValueType &total_sum, + Group subwarp) +{ + prefix_sum = inclusive ? 
element : zero(); + total_sum = element; +#pragma unroll + // hypercube prefix sum + for (auto step = 1; step < subwarp.size(); step *= 2) { + auto neighbor = subwarp.shfl_xor(total_sum, step); + total_sum += neighbor; + prefix_sum += bool(subwarp.thread_rank() & step) ? neighbor : 0; + } +} + +/** + * @internal + * Computes the prefix sum of `element` over a subwarp. + * + * @param element the element over which we compute the prefix sum. + * @param prefix_sum will be set to the sum of all `element`s from lower + * lanes, plus the local `element` if `inclusive` is `true`. + * @param subwarp the cooperative group representing the subwarp. + * + * @tparam inclusive if this is true, the computed prefix sum will be + * inclusive, otherwise it will be exclusive. + * + * @note All threads of the subwarp have to execute this function for it to work + * (and not dead-lock on newer architectures). + */ +template +__forceinline__ __device__ void subwarp_prefix_sum(ValueType element, + ValueType &prefix_sum, + Group subwarp) +{ + ValueType tmp{}; + subwarp_prefix_sum(element, prefix_sum, tmp, subwarp); +} + + +/** + * @internal + * First step of the calculation of a prefix sum. Calculates the prefix sum + * in-place on parts of the array `elements`. + * + * @param elements array on which the prefix sum is to be calculated + * @param block_sum array which stores the total sum of each block, requires at + * least `ceildiv(num_elements, block_size)` elements + * @param num_elements total number of entries in `elements` + * + * @tparam block_size thread block size for this kernel, also size of blocks on + * which this kernel calculates the prefix sum in-place + * + * @note To calculate the prefix sum over an array of size bigger than + * `block_size`, `finalize_prefix_sum` has to be used as well. 
+ */
+template
+__global__ __launch_bounds__(block_size) void start_prefix_sum(
+    size_type num_elements, ValueType *__restrict__ elements,
+    ValueType *__restrict__ block_sum)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    const auto element_id = threadIdx.x;
+    // The scratch buffer uses the element type itself, so partial sums are
+    // accumulated in the same type they are read from and written back to
+    // (no round trip through `size_type`).
+    __shared__ ValueType prefix_helper[block_size];
+    // entries past the end of the input contribute zero
+    prefix_helper[element_id] =
+        (tidx < num_elements) ? elements[tidx] : zero();
+    auto this_block = group::this_thread_block();
+    this_block.sync();
+
+    // Do a normal reduction
+#pragma unroll
+    for (int i = 1; i < block_size; i <<= 1) {
+        const auto ai = i * (2 * element_id + 1) - 1;
+        const auto bi = i * (2 * element_id + 2) - 1;
+        if (bi < block_size) {
+            prefix_helper[bi] += prefix_helper[ai];
+        }
+        this_block.sync();
+    }
+
+    if (element_id == 0) {
+        // Store the total sum
+        block_sum[blockIdx.x] = prefix_helper[block_size - 1];
+        // clearing the last entry makes the down-sweep below produce an
+        // exclusive prefix sum
+        prefix_helper[block_size - 1] = zero();
+    }
+
+    this_block.sync();
+
+    // Perform the down-sweep phase to get the true prefix sum
+#pragma unroll
+    for (int i = block_size >> 1; i > 0; i >>= 1) {
+        const auto ai = i * (2 * element_id + 1) - 1;
+        const auto bi = i * (2 * element_id + 2) - 1;
+        if (bi < block_size) {
+            auto tmp = prefix_helper[ai];
+            prefix_helper[ai] = prefix_helper[bi];
+            prefix_helper[bi] += tmp;
+        }
+        this_block.sync();
+    }
+    // only threads that map to an existing element write back
+    if (tidx < num_elements) {
+        elements[tidx] = prefix_helper[element_id];
+    }
+}
+
+
+/**
+ * @internal
+ * Second step of the calculation of a prefix sum. Increases the value of each
+ * entry of `elements` by the total sum of all preceding blocks.
+ *
+ * @param elements  array on which the prefix sum is to be calculated
+ * @param block_sum  array storing the total sum of each block
+ * @param num_elements  total number of entries in `elements`
+ *
+ * @tparam block_size  thread block size for this kernel, has to be the same as
+ *                     for `start_prefix_sum`
+ *
+ * @note To calculate a prefix sum, first `start_prefix_sum` has to be called.
+ */ +template +__global__ __launch_bounds__(block_size) void finalize_prefix_sum( + size_type num_elements, ValueType *__restrict__ elements, + const ValueType *__restrict__ block_sum) +{ + const auto tidx = thread::get_thread_id_flat(); + + if (tidx < num_elements) { + ValueType prefix_block_sum = zero(); + for (size_type i = 0; i < blockIdx.x; i++) { + prefix_block_sum += block_sum[i]; + } + elements[tidx] += prefix_block_sum; + } +} \ No newline at end of file diff --git a/common/components/reduction.hpp.inc b/common/components/reduction.hpp.inc new file mode 100644 index 00000000000..0bc44e08bb4 --- /dev/null +++ b/common/components/reduction.hpp.inc @@ -0,0 +1,177 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +/** + * @internal + * + * Computes a reduction using the binary operation `reduce_op` on a group + * `group`. Each thread contributes with one element `local_data`. The local + * thread element is always passed as the first parameter to the `reduce_op`. + * The function returns the result of the reduction on all threads. + * + * @note The function is guaranteed to return the correct value on all threads + * only if `reduce_op` is commutative (in addition to being associative). + * Otherwise, the correct value is returned only to the thread with + * subwarp index 0. + */ +template < + typename Group, typename ValueType, typename Operator, + typename = xstd::enable_if_t::value>> +__device__ __forceinline__ ValueType reduce(const Group &group, + ValueType local_data, + Operator reduce_op = Operator{}) +{ +#pragma unroll + for (int32 bitmask = 1; bitmask < group.size(); bitmask <<= 1) { + const auto remote_data = group.shfl_xor(local_data, bitmask); + local_data = reduce_op(local_data, remote_data); + } + return local_data; +} + + +/** + * @internal + * + * Returns the index of the thread that has the element with the largest + * magnitude among all the threads in the group. + * Only the values from threads which set `is_pivoted` to `false` will be + * considered. 
+ */ +template < + typename Group, typename ValueType, + typename = xstd::enable_if_t::value>> +__device__ __forceinline__ int choose_pivot(const Group &group, + ValueType local_data, + bool is_pivoted) +{ + using real = remove_complex; + real lmag = is_pivoted ? -one() : abs(local_data); + const auto pivot = + reduce(group, group.thread_rank(), [&](int lidx, int ridx) { + const auto rmag = group.shfl(lmag, ridx); + if (rmag > lmag) { + lmag = rmag; + lidx = ridx; + } + return lidx; + }); + // pivot operator not commutative, make sure everyone has the same pivot + return group.shfl(pivot, 0); +} + + +/** + * @internal + * + * Computes a reduction using the binary operation `reduce_op` on entire block. + * The data for the reduction is taken from the `data` array which has to be of + * size `block_size` and accessible from all threads. The `data` array is also + * used as work space (so its content will be destroyed in the process), as well + * as to store the return value - which is stored in the 0-th position of the + * array. + */ +template < + typename Group, typename ValueType, typename Operator, + typename = xstd::enable_if_t::value>> +__device__ void reduce(const Group &__restrict__ group, + ValueType *__restrict__ data, + Operator reduce_op = Operator{}) +{ + const auto local_id = group.thread_rank(); + + for (int k = group.size() / 2; k >= config::warp_size; k /= 2) { + group.sync(); + if (local_id < k) { + data[local_id] = reduce_op(data[local_id], data[local_id + k]); + } + } + + const auto warp = group::tiled_partition(group); + const auto warp_id = group.thread_rank() / warp.size(); + if (warp_id > 0) { + return; + } + auto result = reduce(warp, data[warp.thread_rank()], reduce_op); + if (warp.thread_rank() == 0) { + data[0] = result; + } +} + + +/** + * @internal + * + * Computes a reduction using the binary operation `reduce_op` on an array + * `source` of any size. 
Has to be called a second time on `result` to reduce + * an array larger than `block_size`. + */ +template +__device__ void reduce_array(size_type size, + const ValueType *__restrict__ source, + ValueType *__restrict__ result, + Operator reduce_op = Operator{}) +{ + const auto tidx = thread::get_thread_id_flat(); + auto thread_result = zero(); + for (auto i = tidx; i < size; i += blockDim.x * gridDim.x) { + thread_result = reduce_op(thread_result, source[i]); + } + result[threadIdx.x] = thread_result; + + group::this_thread_block().sync(); + + // Stores the result of the reduction inside `result[0]` + reduce(group::this_thread_block(), result, reduce_op); +} + + +/** + * @internal + * + * Computes a reduction using the add operation (+) on an array + * `source` of any size. Has to be called a second time on `result` to reduce + * an array larger than `default_block_size`. + */ +template +__global__ __launch_bounds__(default_block_size) void reduce_add_array( + size_type size, const ValueType *__restrict__ source, + ValueType *__restrict__ result) +{ + __shared__ UninitializedArray block_sum; + reduce_array(size, source, static_cast(block_sum), + [](const ValueType &x, const ValueType &y) { return x + y; }); + + if (threadIdx.x == 0) { + result[blockIdx.x] = block_sum[0]; + } +} diff --git a/common/components/searching.hpp.inc b/common/components/searching.hpp.inc new file mode 100644 index 00000000000..e7e558508f0 --- /dev/null +++ b/common/components/searching.hpp.inc @@ -0,0 +1,238 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +/** + * @internal + * Generic binary search that finds the first index where a predicate is true. + * It assumes that the predicate partitions the range [offset, offset + length) + * into two subranges [offset, middle), [middle, offset + length) such that + * the predicate is `false` for all elements in the first range and `true` for + * all elements in the second range. `middle` is called the partition point. + * If the predicate is `false` everywhere, `middle` equals `offset + length`. + * The implementation is based on Stepanov & McJones, "Elements of Programming". 
+ *
+ * @param offset  the starting index of the partitioned range
+ * @param length  the length of the partitioned range
+ * @param p  the predicate to be evaluated on the range - it should not have
+ *           side-effects and map from `IndexType` to `bool`
+ * @returns the index of `middle`, i.e., the partition point
+ */
+template
+__forceinline__ __device__ IndexType binary_search(IndexType offset,
+                                                   IndexType length,
+                                                   Predicate p)
+{
+    while (length > 0) {
+        auto half_length = length / 2;
+        auto mid = offset + half_length;
+        auto pred = p(mid);
+        // p(mid) true:  continue in [offset, mid)
+        // p(mid) false: continue in [mid + 1, offset + length)
+        length = pred ? half_length : length - (half_length + 1);
+        offset = pred ? offset : mid + 1;
+    }
+    return offset;
+}
+
+
+/**
+ * @internal
+ * Generic implementation of a fixed-size binary search.
+ * The implementation makes sure that the number of predicate evaluations only
+ * depends on `size` and not on the actual position of the partition point.
+ * It assumes that the predicate partitions the range [offset, offset + length)
+ * into two subranges [offset, middle), [middle, offset + length) such that
+ * the predicate is `false` for all elements in the first range and `true` for
+ * all elements in the second range. `middle` is called the partition point.
+ * If the predicate is `false` everywhere, `middle` equals `offset + length`.
+ *
+ * @tparam size  the length of the partitioned range - must be a positive power
+ *               of two (enforced at compile time); the searched range is
+ *               [0, size), i.e. `offset` is 0 and `length` is `size`
+ * @param p  the predicate to be evaluated on the range - it should not have
+ *           side-effects and map from `int` to `bool`
+ * @returns the index of `middle`, i.e., the partition point
+ */
+template
+__forceinline__ __device__ int synchronous_fixed_binary_search(Predicate p)
+{
+    // compile-time preconditions; they already rule out `size == 0`, so no
+    // runtime check for an empty range is needed
+    static_assert(size > 0, "size must be positive");
+    static_assert(!(size & (size - 1)), "size must be a power of two");
+    int begin{};
+#pragma unroll
+    for (auto cur_size = size; cur_size > 1; cur_size /= 2) {
+        auto half_size = cur_size / 2;
+        auto mid = begin + half_size;
+        // invariant: [begin, begin + cur_size] contains partition point
+        begin = p(mid) ? begin : mid;
+    }
+    // cur_size is now 1, so the partition point is either begin or begin + 1
+    return p(begin) ? begin : begin + 1;
+}
+
+
+/**
+ * @internal
+ * Generic implementation of a synchronous binary search.
+ * The implementation makes sure that the number of predicate evaluations only
+ * depends on `size` and not on the actual position of the partition point.
+ * It assumes that the predicate partitions the range [offset, offset + length)
+ * into two subranges [offset, middle), [middle, offset + length) such that
+ * the predicate is `false` for all elements in the first range and `true` for
+ * all elements in the second range. `middle` is called the partition point.
+ * If the predicate is `false` everywhere, `middle` equals `offset + length`.
+ *
+ * @param size  the length of the partitioned range - must be a power of two;
+ *              the searched range is [0, size). For `size == 0` the function
+ *              returns 0 without evaluating `p`.
+ * @param p  the predicate to be evaluated on the range - it should not have
+ *           side-effects and map from `int` to `bool`
+ * @returns the index of `middle`, i.e., the partition point
+ */
+template
+__forceinline__ __device__ int synchronous_binary_search(int size, Predicate p)
+{
+    if (size == 0) {
+        return 0;
+    }
+    int begin{};
+    for (auto cur_size = size; cur_size > 1; cur_size /= 2) {
+        auto half_size = cur_size / 2;
+        auto mid = begin + half_size;
+        // invariant: [begin, begin + cur_size] contains partition point
+        begin = p(mid) ? begin : mid;
+    }
+    // cur_size is now 1, so the partition point is either begin or begin + 1
+    return p(begin) ? begin : begin + 1;
+}
+
+
+/**
+ * @internal
+ * Generic search that finds the first index where a predicate is true.
+ * It assumes that the predicate partitions the range [offset, offset + length)
+ * into two subranges [offset, middle), [middle, offset + length) such that
+ * the predicate is `false` for all elements in the first range and `true` for
+ * all elements in the second range. `middle` is called the partition point.
+ * If the predicate is `false` everywhere, `middle` equals `offset + length`.
+ *
+ * It executes `log2(length / group.size())` coalescing calls to `p`.
+ *
+ * This implementation is based on the w-wide search mentioned in
+ * Green et al., "GPU merge path: a GPU merging algorithm"
+ *
+ * @param offset  the starting index of the partitioned range
+ * @param length  the length of the partitioned range
+ * @param group  the coalescing group executing the search
+ * @param p  the predicate to be evaluated on the range - it should not have
+ *           side-effects and map from `IndexType` to `bool`
+ * @returns the index of `middle`, i.e., the partition point
+ */
+template
+__forceinline__ __device__ IndexType group_wide_search(IndexType offset,
+                                                       IndexType length,
+                                                       Group group, Predicate p)
+{
+    // binary search on the group-sized blocks
+    IndexType num_blocks = (length + group.size() - 1) / group.size();
+    auto group_pos = binary_search(IndexType{}, num_blocks, [&](IndexType i) {
+        auto idx = i * group.size();
+        return p(offset + idx);
+    });
+    // case 1: p is true everywhere: middle is at the beginning
+    if (group_pos == 0) {
+        return offset;
+    }
+    /*
+     * case 2: p is false somewhere:
+     *
+     * p(group_pos * g.size()) is true, so either this is the partition point,
+     * or the partition point is one of the g.size() - 1 previous indices.
+     *   |block group_pos-1|
+     * 0 | 0 * * * * * * * | 1
+     *     ^               ^
+     *     we load this range, with the 1 acting as a sentinel for ffs(...)
+     *
+     * additionally, this means that we can't call p out-of-bounds
+     *
+     * if p is false on every block start, group_pos equals num_blocks and
+     * the `idx >= length` test below provides the sentinel instead
+     */
+    auto base_idx = (group_pos - 1) * group.size() + 1;
+    auto idx = base_idx + group.thread_rank();
+    auto pos = ffs(group.ballot(idx >= length || p(offset + idx))) - 1;
+    return offset + base_idx + pos;
+}
+
+
+/**
+ * @internal
+ * Generic search that finds the first index where a predicate is true.
+ * It assumes that the predicate partitions the range [offset, offset + length)
+ * into two subranges [offset, middle), [middle, offset + length) such that
+ * the predicate is `false` for all elements in the first range and `true` for
+ * all elements in the second range.
`middle` is called the partition point. + * If the predicate is `false` everywhere, `middle` equals `offset + length`. + * + * It executes `log2(length) / log2(group.size())` calls to `p` that effectively + * follow a random-access pattern. + * + * This implementation is based on the w-partition search mentioned in + * Green et al., "GPU merge path: a GPU merging algorithm" + * + * @param offset the starting index of the partitioned range + * @param length the length of the partitioned range + * @param group the coalescing group executing the search + * @param p the predicate to be evaluated on the range - it should not have + * side-effects and map from `IndexType` to `bool` + * @returns the index of `middle`, i.e., the partition point + */ +template +__forceinline__ __device__ IndexType group_ary_search(IndexType offset, + IndexType length, + Group group, Predicate p) +{ + IndexType end = offset + length; + // invariant: [offset, offset + length] contains middle + while (length > group.size()) { + auto stride = length / group.size(); + auto idx = offset + group.thread_rank() * stride; + auto mask = group.ballot(p(idx)); + // if the mask is 0, the partition point is in the last block + // if the mask is ~0, the partition point is in the first block + // otherwise, we go to the last block that returned a 0. + auto pos = mask == 0 ? group.size() - 1 : ffs(mask >> 1) - 1; + auto last_length = length - stride * (group.size() - 1); + length = pos == group.size() - 1 ? last_length : stride; + offset += stride * pos; + } + auto idx = offset + group.thread_rank(); + // if the mask is 0, the partition point is at the end + // otherwise it is the first set bit + auto mask = group.ballot(idx >= end || p(idx)); + auto pos = mask == 0 ? 
group.size() : ffs(mask) - 1; + return offset + pos; +} diff --git a/common/components/segment_scan.hpp.inc b/common/components/segment_scan.hpp.inc new file mode 100644 index 00000000000..3aac34832dc --- /dev/null +++ b/common/components/segment_scan.hpp.inc @@ -0,0 +1,63 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + + +/** + * @internal + * + * Compute a segment scan using add operation (+) of a subwarp. Each segment + * performs suffix sum. Works on the source array and returns whether the thread + * is the first element of its segment with same `ind`. + */ +template +__device__ __forceinline__ bool segment_scan( + const group::thread_block_tile &group, const IndexType ind, + ValueType *__restrict__ val) +{ + bool head = true; +#pragma unroll + for (int i = 1; i < subwarp_size; i <<= 1) { + const IndexType add_ind = group.shfl_up(ind, i); + ValueType add_val = zero(); + if (add_ind == ind && threadIdx.x >= i) { + add_val = *val; + if (i == 1) { + head = false; + } + } + add_val = group.shfl_down(add_val, i); + if (threadIdx.x < subwarp_size - i) { + *val += add_val; + } + } + return head; +} \ No newline at end of file diff --git a/common/components/sorting.hpp.inc b/common/components/sorting.hpp.inc new file mode 100644 index 00000000000..cc042a08d90 --- /dev/null +++ b/common/components/sorting.hpp.inc @@ -0,0 +1,320 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission.
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +namespace detail { + + +/** + * @internal + * Bitonic sorting operation for two elements. + * + * @param reverse sorts in ascending order if `false` and + * descending order if `true`. + */ +template +__forceinline__ __device__ void bitonic_cas(ValueType &a, ValueType &b, + bool reverse) +{ + auto tmp = a; + bool cmp = (a < b) != reverse; + a = cmp ? a : b; + b = cmp ? b : tmp; +} + + +/** + * @internal + * This is a recursive implementation of a bitonic sorting network, + * executed sequentially on locally stored data. + * + * Based on Batcher, "Sorting Networks and Their Applications", 1968. 
+ */ +template +struct bitonic_local { + using half = bitonic_local; + static_assert(num_elements > 0, "number of elements must be positive"); + static_assert((num_elements & (num_elements - 1)) == 0, + "number of elements must be a power of two"); + + // merges two bitonic sequences els[0, n / 2), els[n / 2, n) + __forceinline__ __host__ __device__ static void merge(ValueType *els, + bool reverse) + { + auto els_mid = els + (num_elements / 2); + for (auto i = 0; i < num_elements / 2; ++i) { + bitonic_cas(els[i], els_mid[i], reverse); + } + half::merge(els, reverse); + half::merge(els_mid, reverse); + } + + // sorts an unsorted sequence els [0, n) + __forceinline__ __device__ static void sort(ValueType *els, bool reverse) + { + auto els_mid = els + (num_elements / 2); + // sort first half normally + half::sort(els, reverse); + // sort second half reversed + half::sort(els_mid, !reverse); + // merge two halves + merge(els, reverse); + } +}; + +template +struct bitonic_local { + // nothing to do for a single element + __forceinline__ __device__ static void merge(ValueType *, bool) {} + __forceinline__ __device__ static void sort(ValueType *, bool) {} +}; + + +/** + * @internal + * This is a recursive implementation of a bitonic sorting network, + * executed in parallel within a warp using lane shuffle instructions. + * + * Based on Hou et al., "Fast Segmented Sort on GPUs", 2017. + */ +template +struct bitonic_warp { + constexpr static auto num_elements = num_local * num_threads; + using half = bitonic_warp; + static_assert(num_threads > 0, "number of threads must be positive"); + static_assert(num_local > 0, "number of local elements must be positive"); + static_assert( + config::warp_size % num_threads == 0 && + num_threads <= config::warp_size, + "number of threads must be a power of two smaller than warp_size"); + + // check if we are in the upper half of all threads in this group + // this is important as + // 1. 
for sorting, we have to reverse the sort order in the upper half + // 2. for merging, we have to determine for the XOR shuffle if we are + // the "smaller" thread, as this thread gets the "smaller" element. + __forceinline__ __device__ static bool upper_half() + { + return bool(threadIdx.x & (num_threads / 2)); + } + + __forceinline__ __device__ static void merge(ValueType *els, bool reverse) + { + auto tile = group::thread_block_tile{}; + auto new_reverse = reverse != upper_half(); + for (auto i = 0; i < num_local; ++i) { + auto other = tile.shfl_xor(els[i], num_threads / 2); + bitonic_cas(els[i], other, new_reverse); + } + half::merge(els, reverse); + } + + __forceinline__ __device__ static void sort(ValueType *els, bool reverse) + { + auto new_reverse = reverse != upper_half(); + half::sort(els, new_reverse); + merge(els, reverse); + } +}; + +template +struct bitonic_warp { + using local = bitonic_local; + __forceinline__ __device__ static void merge(ValueType *els, bool reverse) + { + local::merge(els, reverse); + } + __forceinline__ __device__ static void sort(ValueType *els, bool reverse) + { + local::sort(els, reverse); + } +}; + + +/** + * @internal + * This is a recursive implementation of a bitonic sorting network, + * executed in parallel in a thread block using shared memory. + * + * We use a tiled storage pattern to avoid memory bank collisions on shared + * memory accesses, see @ref shared_idx. 
+ */ +template +struct bitonic_global { + constexpr static auto num_elements = num_local * num_threads * num_groups; + using half = bitonic_global; + static_assert(num_groups > 0, "number of groups must be positive"); + static_assert(num_threads > 0, + "number of threads per group must be positive"); + static_assert(num_local > 0, "number of local elements must be positive"); + static_assert(num_total_threads > 0, "number of threads must be positive"); + static_assert(32 % num_groups == 0, + "num_groups must be a power of two <= 32"); + + __forceinline__ __device__ static int shared_idx(int local) + { + auto rank = group::this_thread_block().thread_rank(); + // use the same memory-bank to avoid bank conflicts + return rank + local * num_total_threads; + } + + // check if we are in the upper half of all groups in this block + // this is important as for sorting, we have to reverse the sort order in + // the upper half + __forceinline__ __device__ static bool upper_half() + { + auto rank = group::this_thread_block().thread_rank(); + return bool(rank & (num_groups * num_threads / 2)); + } + + __forceinline__ __device__ static void merge(ValueType *local_els, + ValueType *shared_els, + bool reverse) + { + group::this_thread_block().sync(); + auto upper_shared_els = shared_els + (num_groups * num_threads / 2); + // only the lower group executes the CAS + if (!upper_half()) { + for (auto i = 0; i < num_local; ++i) { + auto j = shared_idx(i); + bitonic_cas(shared_els[j], upper_shared_els[j], reverse); + } + } + half::merge(local_els, shared_els, reverse); + } + + __forceinline__ __device__ static void sort(ValueType *local_els, + ValueType *shared_els, + bool reverse) + { + auto new_reverse = reverse != upper_half(); + half::sort(local_els, shared_els, new_reverse); + merge(local_els, shared_els, reverse); + } +}; + +template +struct bitonic_global { + using warp = bitonic_warp; + + __forceinline__ __device__ static int shared_idx(int local) + { + // use the indexing from 
the general struct + return bitonic_global::shared_idx(local); + } + + __forceinline__ __device__ static void merge(ValueType *local_els, + ValueType *shared_els, + bool reverse) + { + group::this_thread_block().sync(); + for (auto i = 0; i < num_local; ++i) { + local_els[i] = shared_els[shared_idx(i)]; + } + warp::merge(local_els, reverse); + for (auto i = 0; i < num_local; ++i) { + shared_els[shared_idx(i)] = local_els[i]; + } + } + + __forceinline__ __device__ static void sort(ValueType *local_els, + ValueType *shared_els, + bool reverse) + { + auto rank = group::this_thread_block().thread_rank(); + // This is the first step, so we don't need to load from shared memory + warp::sort(local_els, reverse); + // store the sorted elements in shared memory + for (auto i = 0; i < num_local; ++i) { + shared_els[shared_idx(i)] = local_els[i]; + } + } +}; + + +} // namespace detail + + +/** + * @internal + * + * This function sorts elements within a thread block. + * + * It takes a local array of elements and the pointer to a shared buffer of size + * `num_elements` as input. After the execution, the thread with rank `i` in the + * thread block (determined by `group::this_thread_block().thread_rank()`) has + * the elements at index `num_local * i` up to `num_local * i + (num_local - 1)` + * in the sorted sequence stored in its `local_elements` at index 0 up to + * `num_local - 1`. + * + * @note The shared-memory buffer uses a striped layout to limit bank + * collisions, so it should not directly be used to access elements from + * the sorted sequence. If `num_elements <= num_local * warp_size`, the + * algorithm doesn't use/need the shared-memory buffer, so it can be null. + * + * @param local_elements the `num_local` input/output elements from this + * thread. + * @param shared_elements the shared-memory buffer of size `num_elements` + * @tparam num_elements the number of elements - it must be a power of two! 
+ * @tparam num_local the number of elements stored per thread - it must be a + * power of two! + * @tparam ValueType the type of the elements to be sorted - it must implement + * the less-than operator! + */ +template +__forceinline__ __device__ void bitonic_sort(ValueType *local_elements, + ValueType *shared_elements) +{ + constexpr auto num_threads = num_elements / num_local; + constexpr auto num_warps = num_threads / config::warp_size; + static_assert(num_threads <= config::max_block_size, + "bitonic_sort exceeds thread block"); + if (num_warps > 1) { + // these checks are necessary since the `if` is not evaluated at + // compile-time so even though the branch is never taken, it still gets + // instantiated and must thus compile. + constexpr auto _num_warps = num_warps <= 1 ? 1 : num_warps; + constexpr auto _num_threads = + num_threads <= config::warp_size ? config::warp_size : num_threads; + detail::bitonic_global::sort(local_elements, + shared_elements, + false); + } else { + constexpr auto _num_threads = num_warps > 1 ? 1 : num_threads; + detail::bitonic_warp::sort( + local_elements, false); + } +} diff --git a/common/components/thread_ids.hpp.inc b/common/components/thread_ids.hpp.inc new file mode 100644 index 00000000000..3a28dad5326 --- /dev/null +++ b/common/components/thread_ids.hpp.inc @@ -0,0 +1,272 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +/** + * @internal + * + * Returns the ID of the block group this thread belongs to. + * + * @return the ID of the block group this thread belongs to + * + * @note Assumes that grid dimensions are in standard format: + * `(block_group_size, first_grid_dimension, second grid_dimension)` + */ +__device__ __forceinline__ size_type get_block_group_id() +{ + return static_cast(blockIdx.z) * gridDim.y + blockIdx.y; +} + +/** + * @internal + * + * Returns the ID of the block this thread belongs to. + * + * @return the ID of the block this thread belongs to + * + * @note Assumes that grid dimensions are in standard format: + * `(block_group_size, first_grid_dimension, second grid_dimension)` + */ +__device__ __forceinline__ size_type get_block_id() +{ + return get_block_group_id() * gridDim.x + blockIdx.x; +} + + +/** + * @internal + * + * Returns the local ID of the warp (relative to the block) this thread belongs + * to. 
+ * + * @return the local ID of the warp (relative to the block) this thread belongs + * to + * + * @note Assumes that block dimensions are in standard format: + * `(subwarp_size, config::warp_size / subwarp_size, block_size / + * config::warp_size)` + */ +__device__ __forceinline__ size_type get_local_warp_id() +{ + return static_cast(threadIdx.z); +} + + +/** + * @internal + * + * Returns the local ID of the sub-warp (relative to the block) this thread + * belongs to. + * + * @tparam subwarp_size size of the subwarp + * + * @return the local ID of the sub-warp (relative to the block) this thread + * belongs to + * + * @note Assumes that block dimensions are in standard format: + * `(subwarp_size, config::warp_size / subwarp_size, block_size / + * config::warp_size)` + */ +template +__device__ __forceinline__ size_type get_local_subwarp_id() +{ + constexpr auto subwarps_per_warp = config::warp_size / subwarp_size; + return get_local_warp_id() * subwarps_per_warp + threadIdx.y; +} + + +/** + * @internal + * + * Returns the local ID of the thread (relative to the block). + * to. + * + * @tparam subwarp_size size of the subwarp + * + * @return the local ID of the thread (relative to the block) + * + * @note Assumes that block dimensions are in standard format: + * `(subwarp_size, config::warp_size / subwarp_size, block_size / + * config::warp_size)` + */ +template +__device__ __forceinline__ size_type get_local_thread_id() +{ + return get_local_subwarp_id() * subwarp_size + threadIdx.x; +} + + +/** + * @internal + * + * Returns the global ID of the warp this thread belongs to. + * + * @tparam warps_per_block number of warps within each block + * + * @return the global ID of the warp this thread belongs to. 
+ * + * @note Assumes that block dimensions and grid dimensions are in standard + * format: + * `(subwarp_size, config::warp_size / subwarp_size, block_size / + * config::warp_size)` and + * `(block_group_size, first_grid_dimension, second grid_dimension)`, + * respectively. + */ +template +__device__ __forceinline__ size_type get_warp_id() +{ + return get_block_id() * warps_per_block + get_local_warp_id(); +} + + +/** + * @internal + * + * Returns the global ID of the sub-warp this thread belongs to. + * + * @tparam subwarp_size size of the subwarp + * + * @return the global ID of the sub-warp this thread belongs to. + * + * @note Assumes that block dimensions and grid dimensions are in standard + * format: + * `(subwarp_size, config::warp_size / subwarp_size, block_size / + * config::warp_size)` and + * `(block_group_size, first_grid_dimension, second grid_dimension)`, + * respectively. + */ +template +__device__ __forceinline__ size_type get_subwarp_id() +{ + constexpr auto subwarps_per_warp = config::warp_size / subwarp_size; + return get_warp_id() * subwarps_per_warp + threadIdx.y; +} + + +/** + * @internal + * + * Returns the global ID of the thread. + * + * @return the global ID of the thread. + * + * @tparam subwarp_size size of the subwarp + * + * @note Assumes that block dimensions and grid dimensions are in standard + * format: + * `(subwarp_size, config::warp_size / subwarp_size, block_size / + * config::warp_size)` and + * `(block_group_size, first_grid_dimension, second grid_dimension)`, + * respectively. + */ +template +__device__ __forceinline__ size_type get_thread_id() +{ + return get_subwarp_id() * subwarp_size + + threadIdx.x; +} + + +/** + * @internal + * + * Returns the global ID of the thread in the given index type. + * This function assumes one-dimensional thread and block indexing. + * + * @return the global ID of the thread in the given index type. 
+ * + * @tparam IndexType the index type + */ +template +__device__ __forceinline__ IndexType get_thread_id_flat() +{ + return threadIdx.x + static_cast(blockDim.x) * blockIdx.x; +} + + +/** + * @internal + * + * Returns the total number of threads in the given index type. + * This function assumes one-dimensional thread and block indexing. + * + * @return the total number of threads in the given index type. + * + * @tparam IndexType the index type + */ +template +__device__ __forceinline__ IndexType get_thread_num_flat() +{ + return blockDim.x * static_cast(gridDim.x); +} + + +/** + * @internal + * + * Returns the global ID of the subwarp in the given index type. + * This function assumes one-dimensional thread and block indexing + * with a power of two block size of at least subwarp_size. + * + * @return the global ID of the subwarp in the given index type. + * + * @tparam subwarp_size the size of the subwarp. Must be a power of two! + * @tparam IndexType the index type + */ +template +__device__ __forceinline__ IndexType get_subwarp_id_flat() +{ + static_assert(!(subwarp_size & (subwarp_size - 1)), + "subwarp_size must be a power of two"); + return threadIdx.x / subwarp_size + + static_cast(blockDim.x / subwarp_size) * blockIdx.x; +} + + +/** + * @internal + * + * Returns the total number of subwarps in the given index type. + * This function assumes one-dimensional thread and block indexing + * with a power of two block size of at least subwarp_size. + * + * @return the total number of subwarps in the given index type. + * + * @tparam subwarp_size the size of the subwarp. Must be a power of two! 
+ * @tparam IndexType the index type + */ +template +__device__ __forceinline__ IndexType get_subwarp_num_flat() +{ + static_assert(!(subwarp_size & (subwarp_size - 1)), + "subwarp_size must be a power of two"); + return blockDim.x / subwarp_size * static_cast(gridDim.x); +} \ No newline at end of file diff --git a/common/components/uninitialized_array.hpp.inc b/common/components/uninitialized_array.hpp.inc new file mode 100644 index 00000000000..ced072c40f4 --- /dev/null +++ b/common/components/uninitialized_array.hpp.inc @@ -0,0 +1,93 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + + +/** + * Stores an array with uninitialized contents. + * + * This class is needed for datatypes that have a non-empty constructor when + * using them as shared memory, for example `thrust::complex`. + * + * @tparam ValueType the type of values + * @tparam size the size of the array + */ +template +class UninitializedArray { +public: + /** + * Operator for casting an UninitializedArray into its constexpr value + * pointer. + * + * @return the constexpr pointer to the first entry of the array. + */ + constexpr GKO_ATTRIBUTES operator ValueType *() const noexcept + { + return &(*this)[0]; + } + + /** + * Operator for casting an UninitializedArray into its non-const value + * pointer. + * + * @return the non-const pointer to the first entry of the array. + */ + GKO_ATTRIBUTES operator ValueType *() noexcept { return &(*this)[0]; } + + /** + * constexpr array access operator. + * + * @param pos The array index. Using a value outside [0, size) is undefined + * behavior. + * + * @return a reference to the array entry at the given index. + */ + constexpr GKO_ATTRIBUTES ValueType &operator[](size_type pos) const noexcept + { + return reinterpret_cast(data_)[pos]; + } + + /** + * Non-const array access operator. + * + * @param pos The array index. Using a value outside [0, size) is undefined + * behavior. + * + * @return a reference to the array entry at the given index.
+ */ + GKO_ATTRIBUTES ValueType &operator[](size_type pos) noexcept + { + return reinterpret_cast(data_)[pos]; + } + +private: + unsigned char data_[sizeof(ValueType) / sizeof(unsigned char) * size]; +}; diff --git a/common/components/warp_blas.hpp.inc b/common/components/warp_blas.hpp.inc new file mode 100644 index 00000000000..d99b009d9bb --- /dev/null +++ b/common/components/warp_blas.hpp.inc @@ -0,0 +1,391 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +/** + * @internal + * + * Defines a postprocessing transformation that should be performed on the + * result of a function call. + * + * @note This functionality should become useless once accessors and ranges are + * in place, as they will define the storage scheme. + */ +enum postprocess_transformation { and_return, and_transpose }; + + +/** + * @internal + * + * Applies a Gauss-Jordan transformation (single step of Gauss-Jordan + * elimination) to a `max_problem_size`-by-`max_problem_size` matrix using the + * thread group `group`. Each thread contributes one `row` of the matrix, and the + * routine uses warp shuffles to exchange data between rows. The transform is + * performed by using the `key_row`-th row and `key_col`-th column of the + * matrix.
+ */ +template < + int max_problem_size, typename Group, typename ValueType, + typename = xstd::enable_if_t::value>> +__device__ __forceinline__ void apply_gauss_jordan_transform( + const Group &__restrict__ group, int32 key_row, int32 key_col, + ValueType *__restrict__ row, bool &__restrict__ status) +{ + auto key_col_elem = group.shfl(row[key_col], key_row); + if (key_col_elem == zero()) { + // TODO: implement error handling for GPUs to be able to properly + // report it here + status = false; + return; + } + if (group.thread_rank() == key_row) { + key_col_elem = one() / key_col_elem; + } else { + key_col_elem = -row[key_col] / key_col_elem; + } +#pragma unroll + for (int32 i = 0; i < max_problem_size; ++i) { + const auto key_row_elem = group.shfl(row[i], key_row); + if (group.thread_rank() == key_row) { + row[i] = zero(); + } + row[i] += key_col_elem * key_row_elem; + } + row[key_col] = key_col_elem; +} + + +/** + * @internal + * + * Inverts a matrix using Gauss-Jordan elimination. The inversion is + * done in-place, so the original matrix will be overridden with the inverse. + * The inversion routine uses implicit pivoting, so the returned matrix will be + * a permuted inverse (from both sides). To obtain the correct inverse, the + * rows of the result should be permuted with $P$, and the columns with + * $ P^T $ (i.e. $ A^{-1} = P X P $, where $ X $ is the returned matrix). These + * permutation matrices are returned compressed as vectors `perm` + * and`trans_perm`, respectively. `i`-th value of each of the vectors is + * returned to thread of the group with rank `i`. 
+ * + * @tparam max_problem_size the maximum problem size that will be passed to the + * inversion routine (a tighter bound results in + * faster code) + * @tparam Group type of the group of threads + * @tparam ValueType type of values stored in the matrix + * + * @param group the group of threads which participate in the inversion + * @param problem_size the actual size of the matrix (cannot be larger than + * max_problem_size) + * @param row a pointer to the matrix row (i-th thread in the group should + * pass the pointer to the i-th row), has to have at least + * max_problem_size elements + * @param perm a value to hold an element of permutation matrix $ P $ + * @param trans_perm a value to hold an element of permutation matrix $ P^T $ + * + * @return true if the inversion succeeded, false otherwise + */ +template < + int max_problem_size, typename Group, typename ValueType, + typename = xstd::enable_if_t::value>> +__device__ __forceinline__ bool invert_block(const Group &__restrict__ group, + uint32 problem_size, + ValueType *__restrict__ row, + uint32 &__restrict__ perm, + uint32 &__restrict__ trans_perm) +{ + GKO_ASSERT(problem_size <= max_problem_size); + // prevent rows after problem_size to become pivots + auto pivoted = group.thread_rank() >= problem_size; + auto status = true; +#ifdef GINKGO_JACOBI_FULL_OPTIMIZATIONS +#pragma unroll +#else +#pragma unroll 1 +#endif + for (int32 i = 0; i < max_problem_size; ++i) { + if (i < problem_size) { + const auto piv = choose_pivot(group, row[i], pivoted); + if (group.thread_rank() == piv) { + perm = i; + pivoted = true; + } + if (group.thread_rank() == i) { + trans_perm = piv; + } + apply_gauss_jordan_transform(group, piv, i, row, + status); + } + } + return status; +} + + +/** + * @internal + * + * Performs the correct index calculation for the given postprocess operation.
+ */ +template +__host__ __device__ __forceinline__ auto get_row_major_index(T1 row, T2 col, + T3 stride) -> + typename std::enable_if< + mod != and_transpose, + typename std::decay::type>::type +{ + return row * stride + col; +} + + +template +__host__ __device__ __forceinline__ auto get_row_major_index(T1 row, T2 col, + T3 stride) -> + typename std::enable_if< + mod == and_transpose, + typename std::decay::type>::type +{ + return col * stride + row; +} + + +/** + * @internal + * + * Copies a matrix stored as a collection of rows in different threads of the + * warp in a block of memory accessible by all threads in row-major order. + * Optionally permutes rows and columns of the matrix in the process. + * + * @tparam max_problem_size maximum problem size passed to the routine + * @tparam mod the transformation to perform on the return data + * @tparam Group type of the group of threads + * @tparam SourceValueType type of values stored in the source matrix + * @tparam ResultValueType type of values stored in the result matrix + * + * @param group group of threads participating in the copy + * @param problem_size actual size of the matrix + * (`problem_size <= max_problem_size`) + * @param source_row pointer to memory used to store a row of the source matrix + * `i`-th thread of the sub-warp should pass in the `i`-th + * row of the matrix + * @param increment offset between two consecutive elements of the row + * @param row_perm permutation vector to apply on the rows of the matrix + * (thread `i` supplies the `i`-th value of the vector) + * @param col_perm permutation vector to apply on the column of the matrix + * (thread `i` supplies the `i`-th value of the vector) + * @param destination pointer to memory where the result will be stored + * (all threads supply the same value) + * @param stride offset between two consecutive rows of the matrix + */ +template < + int max_problem_size, postprocess_transformation mod = and_return, + typename Group, typename 
SourceValueType, typename ResultValueType, + typename = xstd::enable_if_t::value>> +__device__ __forceinline__ void copy_matrix( + const Group &__restrict__ group, uint32 problem_size, + const SourceValueType *__restrict__ source_row, uint32 increment, + uint32 row_perm, uint32 col_perm, ResultValueType *__restrict__ destination, + size_type stride) +{ + GKO_ASSERT(problem_size <= max_problem_size); +#pragma unroll + for (int32 i = 0; i < max_problem_size; ++i) { + if (i < problem_size) { + const auto idx = group.shfl(col_perm, i); + if (group.thread_rank() < problem_size) { + // Need to assign a variable for the source_row, or hip + // will use a lot of VGPRs in unroll. This might lead to + // problems. + const auto val = source_row[i * increment]; + destination[get_row_major_index(idx, row_perm, stride)] = + static_cast(val); + } + } + } +} + + +/** + * @internal + * + * Multiplies a transposed vector and a matrix stored in column-major order. + * + * In mathematical terms, performs the operation $ res^T = vec^T \cdot mtx$. 
+ * + * @tparam max_problem_size maximum problem size passed to the routine + * @tparam Group type of the group of threads + * @tparam MatrixValueType type of values stored in the matrix + * @tparam VectorValueType type of values stored in the vectors + * + * @param group group of threads participating in the operation + * @param problem_size actual size of the matrix + * (`problem_size <= max_problem_size`) + * @param vec input vector to multiply (thread `i` supplies the `i`-th value of + * the vector) + * @param mtx_row pointer to memory used to store a row of the input matrix, + * `i`-th thread of the sub-warp should pass in the + * `i`-th row of the matrix + * @param mtx_increment offset between two consecutive elements of the row + * @param res pointer to a block of memory where the result will be written + * (only thread 0 of the group has to supply a valid value) + * @param res_increment offset between two consecutive elements of the result + */ +template < + int max_problem_size, typename Group, typename MatrixValueType, + typename VectorValueType, + typename = xstd::enable_if_t::value>> +__device__ __forceinline__ void multiply_transposed_vec( + const Group &__restrict__ group, uint32 problem_size, + const VectorValueType &__restrict__ vec, + const MatrixValueType *__restrict__ mtx_row, uint32 mtx_increment, + VectorValueType *__restrict__ res, uint32 res_increment) +{ + GKO_ASSERT(problem_size <= max_problem_size); + auto mtx_elem = zero(); +#pragma unroll + for (int32 i = 0; i < max_problem_size; ++i) { + if (i < problem_size) { + if (group.thread_rank() < problem_size) { + mtx_elem = + static_cast(mtx_row[i * mtx_increment]); + } + const auto out = reduce( + group, mtx_elem * vec, + [](VectorValueType x, VectorValueType y) { return x + y; }); + if (group.thread_rank() == 0) { + res[i * res_increment] = out; + } + } + } +} + + +/** + * @internal + * + * Multiplies a matrix and a vector stored in column-major order. 
+ * + * In mathematical terms, performs the operation $res = mtx \cdot vec$. + * + * @tparam max_problem_size maximum problem size passed to the routine + * @tparam Group type of the group of threads + * @tparam MatrixValueType type of values stored in the matrix + * @tparam VectorValueType type of values stored in the vectors + * @tparam Closure type of the function used to write the result + * + * @param group group of threads participating in the operation + * @param problem_size actual size of the matrix + * (`problem_size <= max_problem_size`) + * @param vec input vector to multiply (thread `i` supplies the `i`-th value of + * the vector) + * @param mtx_row pointer to memory used to store a row of the input matrix, + * `i`-th thread of the sub-warp should pass in the + * `i`-th row of the matrix + * @param mtx_increment offset between two consecutive elements of the row + * @param res pointer to a block of memory where the result will be written + * (every thread with rank below `problem_size` writes its own entry, so all of them have to supply a valid value) + * @param res_increment offset between two consecutive elements of the result + * @param closure_op Operation that is performed when writing to + `res[group.thread_rank() * res_increment]` as + `closure_op(res[group.thread_rank() * res_increment], out)` + where `out` is the result of $mtx \cdot vec$. 
+ */ +template < + int max_problem_size, typename Group, typename MatrixValueType, + typename VectorValueType, typename Closure, + typename = xstd::enable_if_t::value>> +__device__ __forceinline__ void multiply_vec( + const Group &__restrict__ group, uint32 problem_size, + const VectorValueType &__restrict__ vec, + const MatrixValueType *__restrict__ mtx_row, uint32 mtx_increment, + VectorValueType *__restrict__ res, uint32 res_increment, Closure closure_op) +{ + GKO_ASSERT(problem_size <= max_problem_size); + auto mtx_elem = zero(); + auto out = zero(); +#pragma unroll + for (int32 i = 0; i < max_problem_size; ++i) { + if (i < problem_size) { + if (group.thread_rank() < problem_size) { + mtx_elem = + static_cast(mtx_row[i * mtx_increment]); + } + out += mtx_elem * group.shfl(vec, i); + } + } + if (group.thread_rank() < problem_size) { + closure_op(res[group.thread_rank() * res_increment], out); + } +} + + +/** + * @internal + * + * Computes the infinity norm of a matrix. Each thread in the group supplies + * one row of the matrix. 
+ * + * @tparam max_problem_size maximum problem size passed to the routine + * @tparam Group type of the group of threads + * @tparam ValueType type of values stored in the matrix + * + * @param group group of threads participating in the operation + * @param num_rows number of rows of the matrix + * (`num_rows <= max_problem_size`) + * @param num_cols number of columns of the matrix + * @param row pointer to memory used to store a row of the input matrix, + * `i`-th thread of the group should pass in the `i`-th row of the + * matrix + * + * @return the infinity norm of the matrix + */ +template < + int max_problem_size, typename Group, typename ValueType, + typename = xstd::enable_if_t::value>> +__device__ __forceinline__ remove_complex compute_infinity_norm( + const Group &group, uint32 num_rows, uint32 num_cols, const ValueType *row) +{ + using result_type = remove_complex; + auto sum = zero(); + if (group.thread_rank() < num_rows) { +#ifdef GINKGO_JACOBI_FULL_OPTIMIZATIONS +#pragma unroll +#else +#pragma unroll 1 +#endif + for (uint32 i = 0; i < max_problem_size; ++i) { + if (i < num_cols) { + sum += abs(row[i]); + } + } + } + return reduce(group, sum, + [](result_type x, result_type y) { return max(x, y); }); +} diff --git a/common/factorization/factorization_kernels.hpp.inc b/common/factorization/factorization_kernels.hpp.inc new file mode 100644 index 00000000000..7050c5ce116 --- /dev/null +++ b/common/factorization/factorization_kernels.hpp.inc @@ -0,0 +1,364 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +namespace kernel { + + +namespace detail { + + +// Default implementation for the unsorted case +template +struct find_helper { + template + static __forceinline__ __device__ bool find(Group subwarp_grp, + const IndexType *first, + const IndexType *last, + IndexType value) + { + auto subwarp_idx = subwarp_grp.thread_rank(); + bool found{false}; + for (auto curr_start = first; curr_start < last; + curr_start += subwarp_grp.size()) { + const auto curr = curr_start + subwarp_idx; + found = (curr < last && *curr == value); + found = subwarp_grp.any(found); + if (found) { + break; + } + } + return found; + } +}; + + +// Improved version in case the CSR matrix is sorted +template <> +struct find_helper { + template + static __forceinline__ __device__ bool find(Group subwarp_grp, + const IndexType *first, + const IndexType *last, + IndexType value) + { + const auto length = static_cast(last - first); + const auto pos = + group_wide_search(IndexType{}, length, subwarp_grp, + [&](IndexType i) { return first[i] >= value; }); + return pos < length && first[pos] == value; + } +}; + + +} // namespace detail + + +// SubwarpSize needs to be a power of 2 +// Each subwarp works on one row +template +__global__ + __launch_bounds__(default_block_size) void find_missing_diagonal_elements( + IndexType num_rows, IndexType num_cols, + const IndexType *__restrict__ col_idxs, + const IndexType *__restrict__ row_ptrs, + IndexType *__restrict__ elements_to_add_per_row, + bool *__restrict__ changes_required) +{ + const auto total_subwarp_count = + thread::get_subwarp_num_flat(); + const auto begin_row = + thread::get_subwarp_id_flat(); + + auto thread_block = group::this_thread_block(); + auto subwarp_grp = group::tiled_partition(thread_block); + const auto subwarp_idx = subwarp_grp.thread_rank(); + + bool local_change{false}; + for (auto row = begin_row; row < num_rows; row += total_subwarp_count) { + if (row >= num_cols) { + 
if (subwarp_idx == 0) { + elements_to_add_per_row[row] = 0; + } + continue; + } + const auto *start_cols = col_idxs + row_ptrs[row]; + const auto *end_cols = col_idxs + row_ptrs[row + 1]; + if (detail::find_helper::find(subwarp_grp, start_cols, + end_cols, row)) { + if (subwarp_idx == 0) { + elements_to_add_per_row[row] = 0; + } + } else { + if (subwarp_idx == 0) { + elements_to_add_per_row[row] = 1; + } + local_change = true; + } + } + // Could also be reduced (not sure if that leads to a performance benefit) + if (local_change && subwarp_idx == 0) { + *changes_required = true; + } +} + + +// SubwarpSize needs to be a power of 2 +// Each subwarp works on one row +template +__global__ + __launch_bounds__(default_block_size) void add_missing_diagonal_elements( + IndexType num_rows, const ValueType *__restrict__ old_values, + const IndexType *__restrict__ old_col_idxs, + const IndexType *__restrict__ old_row_ptrs, + ValueType *__restrict__ new_values, + IndexType *__restrict__ new_col_idxs, + const IndexType *__restrict__ row_ptrs_addition) +{ + // Precaution in case not enough threads were created + const auto total_subwarp_count = + thread::get_subwarp_num_flat(); + const auto begin_row = + thread::get_subwarp_id_flat(); + + auto thread_block = group::this_thread_block(); + auto subwarp_grp = group::tiled_partition(thread_block); + const auto subwarp_idx = subwarp_grp.thread_rank(); + + for (auto row = begin_row; row < num_rows; row += total_subwarp_count) { + const IndexType old_row_start{old_row_ptrs[row]}; + const IndexType old_row_end{old_row_ptrs[row + 1]}; + const IndexType new_row_start{old_row_start + row_ptrs_addition[row]}; + const IndexType new_row_end{old_row_end + row_ptrs_addition[row + 1]}; + + // if no element needs to be added, do a simple copy of the whole row + if (new_row_end - new_row_start == old_row_end - old_row_start) { + for (IndexType i = subwarp_idx; i < new_row_end - new_row_start; + i += SubwarpSize) { + const IndexType new_idx = 
new_row_start + i; + const IndexType old_idx = old_row_start + i; + new_values[new_idx] = old_values[old_idx]; + new_col_idxs[new_idx] = old_col_idxs[old_idx]; + } + } else { + IndexType new_idx = new_row_start + subwarp_idx; + bool diagonal_added{false}; + for (IndexType old_idx_start = old_row_start; + old_idx_start < old_row_end; + old_idx_start += SubwarpSize, new_idx += SubwarpSize) { + const auto old_idx = old_idx_start + subwarp_idx; + bool thread_is_active = old_idx < old_row_end; + const auto col_idx = + thread_is_active ? old_col_idxs[old_idx] : IndexType{}; + // automatically false if thread is not active + bool diagonal_add_required = !diagonal_added && row < col_idx; + auto ballot = subwarp_grp.ballot(diagonal_add_required); + + if (ballot) { + auto first_subwarp_idx = ffs(ballot) - 1; + if (first_subwarp_idx == subwarp_idx) { + new_values[new_idx] = zero(); + new_col_idxs[new_idx] = row; + } + if (thread_is_active) { + // if diagonal was inserted in a thread below this one, + // add it to the new_idx. 
+ bool is_thread_after_diagonal = + (first_subwarp_idx <= subwarp_idx); + new_idx += is_thread_after_diagonal; + new_values[new_idx] = old_values[old_idx]; + new_col_idxs[new_idx] = col_idx; + // if diagonal is inserted in a thread after this one, + // it needs to be considered after writing the values + new_idx += !is_thread_after_diagonal; + } + diagonal_added = true; + } else if (thread_is_active) { + new_values[new_idx] = old_values[old_idx]; + new_col_idxs[new_idx] = col_idx; + } + } + if (!diagonal_added && subwarp_idx == 0) { + new_idx = new_row_end - 1; + new_values[new_idx] = zero(); + new_col_idxs[new_idx] = row; + } + } + } +} + + +template +__global__ __launch_bounds__(default_block_size) void update_row_ptrs( + IndexType num_rows, IndexType *__restrict__ row_ptrs, + IndexType *__restrict__ row_ptr_addition) +{ + const auto total_thread_count = thread::get_thread_num_flat(); + const auto begin_row = thread::get_thread_id_flat(); + + for (auto row = begin_row; row < num_rows; row += total_thread_count) { + row_ptrs[row] += row_ptr_addition[row]; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void count_nnz_per_l_u_row( + size_type num_rows, const IndexType *__restrict__ row_ptrs, + const IndexType *__restrict__ col_idxs, + const ValueType *__restrict__ values, IndexType *__restrict__ l_nnz_row, + IndexType *__restrict__ u_nnz_row) +{ + const auto row = thread::get_thread_id_flat(); + if (row < num_rows) { + IndexType l_row_nnz{}; + IndexType u_row_nnz{}; + for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; ++idx) { + auto col = col_idxs[idx]; + // skip diagonal + l_row_nnz += (col < row); + u_row_nnz += (row < col); + } + // add the diagonal entry + l_nnz_row[row] = l_row_nnz + 1; + u_nnz_row[row] = u_row_nnz + 1; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void initialize_l_u( + size_type num_rows, const IndexType *__restrict__ row_ptrs, + const IndexType *__restrict__ col_idxs, + const ValueType 
*__restrict__ values, + const IndexType *__restrict__ l_row_ptrs, + IndexType *__restrict__ l_col_idxs, ValueType *__restrict__ l_values, + const IndexType *__restrict__ u_row_ptrs, + IndexType *__restrict__ u_col_idxs, ValueType *__restrict__ u_values) +{ + const auto row = thread::get_thread_id_flat(); + if (row < num_rows) { + auto l_idx = l_row_ptrs[row]; + auto u_idx = u_row_ptrs[row] + 1; // we treat the diagonal separately + // default diagonal to one + auto diag_val = one(); + for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) { + const auto col = col_idxs[i]; + const auto val = values[i]; + // save diagonal entry for later + if (col == row) { + diag_val = val; + } + if (col < row) { + l_col_idxs[l_idx] = col; + l_values[l_idx] = val; + ++l_idx; + } + if (row < col) { + u_col_idxs[u_idx] = col; + u_values[u_idx] = val; + ++u_idx; + } + } + // store diagonal entries + auto l_diag_idx = l_row_ptrs[row + 1] - 1; + auto u_diag_idx = u_row_ptrs[row]; + l_col_idxs[l_diag_idx] = row; + u_col_idxs[u_diag_idx] = row; + l_values[l_diag_idx] = one(); + u_values[u_diag_idx] = diag_val; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void count_nnz_per_l_row( + size_type num_rows, const IndexType *__restrict__ row_ptrs, + const IndexType *__restrict__ col_idxs, + const ValueType *__restrict__ values, IndexType *__restrict__ l_nnz_row) +{ + const auto row = thread::get_thread_id_flat(); + if (row < num_rows) { + IndexType l_row_nnz{}; + for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; ++idx) { + auto col = col_idxs[idx]; + // skip the diagonal entry + l_row_nnz += col < row; + } + // add the diagonal entry + l_nnz_row[row] = l_row_nnz + 1; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void initialize_l( + size_type num_rows, const IndexType *__restrict__ row_ptrs, + const IndexType *__restrict__ col_idxs, + const ValueType *__restrict__ values, + const IndexType *__restrict__ l_row_ptrs, + IndexType 
*__restrict__ l_col_idxs, ValueType *__restrict__ l_values, + bool use_sqrt) +{ + const auto row = thread::get_thread_id_flat(); + if (row < num_rows) { + auto l_idx = l_row_ptrs[row]; + // if there was no diagonal entry, default to one + auto diag_val = one(); + for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) { + const auto col = col_idxs[i]; + const auto val = values[i]; + // save diagonal entry for later + if (col == row) { + diag_val = val; + } + if (col < row) { + l_col_idxs[l_idx] = col; + l_values[l_idx] = val; + ++l_idx; + } + } + // store diagonal entries + auto l_diag_idx = l_row_ptrs[row + 1] - 1; + l_col_idxs[l_diag_idx] = row; + // compute square root with sentinel + if (use_sqrt) { + diag_val = sqrt(diag_val); + if (!is_finite(diag_val)) { + diag_val = one(); + } + } + l_values[l_diag_idx] = diag_val; + } +} + + +} // namespace kernel diff --git a/common/factorization/par_ict_spgeam_kernels.hpp.inc b/common/factorization/par_ict_spgeam_kernels.hpp.inc new file mode 100644 index 00000000000..7a9febf3f03 --- /dev/null +++ b/common/factorization/par_ict_spgeam_kernels.hpp.inc @@ -0,0 +1,237 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +namespace kernel { + + +template +__global__ __launch_bounds__(default_block_size) void ict_tri_spgeam_nnz( + const IndexType *__restrict__ llt_row_ptrs, + const IndexType *__restrict__ llt_col_idxs, + const IndexType *__restrict__ a_row_ptrs, + const IndexType *__restrict__ a_col_idxs, + IndexType *__restrict__ l_new_row_ptrs, IndexType num_rows) +{ + auto subwarp = + group::tiled_partition(group::this_thread_block()); + auto row = thread::get_subwarp_id_flat(); + if (row >= num_rows) { + return; + } + + auto llt_begin = llt_row_ptrs[row]; + auto llt_size = llt_row_ptrs[row + 1] - llt_begin; + auto a_begin = a_row_ptrs[row]; + auto a_size = a_row_ptrs[row + 1] - a_begin; + IndexType count{}; + group_merge( + a_col_idxs + a_begin, a_size, llt_col_idxs + llt_begin, llt_size, + subwarp, + [&](IndexType a_nz, IndexType a_col, IndexType llt_nz, + IndexType llt_col, IndexType out_nz, bool valid) { + auto col = min(a_col, llt_col); + // count the number of unique elements being merged + count += + popcnt(subwarp.ballot(col <= row && a_col != llt_col && valid)); + return true; + }); + if (subwarp.thread_rank() == 0) { + l_new_row_ptrs[row] = count; + } 
+} + + +template +__global__ __launch_bounds__(default_block_size) void ict_tri_spgeam_init( + const IndexType *__restrict__ llt_row_ptrs, + const IndexType *__restrict__ llt_col_idxs, + const ValueType *__restrict__ llt_vals, + const IndexType *__restrict__ a_row_ptrs, + const IndexType *__restrict__ a_col_idxs, + const ValueType *__restrict__ a_vals, + const IndexType *__restrict__ l_row_ptrs, + const IndexType *__restrict__ l_col_idxs, + const ValueType *__restrict__ l_vals, + const IndexType *__restrict__ l_new_row_ptrs, + IndexType *__restrict__ l_new_col_idxs, ValueType *__restrict__ l_new_vals, + IndexType num_rows) +{ + auto subwarp = + group::tiled_partition(group::this_thread_block()); + auto row = thread::get_subwarp_id_flat(); + if (row >= num_rows) { + return; + } + + auto lane = static_cast(subwarp.thread_rank()); + auto lanemask_eq = config::lane_mask_type{1} << lane; + auto lanemask_lt = lanemask_eq - 1; + + // merge lower triangle of A, L*L^T (and L) + auto l_begin = l_row_ptrs[row]; + auto l_end = l_row_ptrs[row + 1]; + + auto llt_begin = llt_row_ptrs[row]; + auto llt_end = llt_row_ptrs[row + 1]; + auto llt_size = llt_end - llt_begin; + + auto a_begin = a_row_ptrs[row]; + auto a_end = a_row_ptrs[row + 1]; + auto a_size = a_end - a_begin; + + IndexType out_begin{}; + auto out_size = llt_size + a_size; + + IndexType l_new_begin = l_new_row_ptrs[row]; + + constexpr auto sentinel = device_numeric_limits::max; + // load column indices and values for the first merge step + auto a_col = checked_load(a_col_idxs, a_begin + lane, a_end, sentinel); + auto a_val = checked_load(a_vals, a_begin + lane, a_end, zero()); + auto llt_col = + checked_load(llt_col_idxs, llt_begin + lane, llt_end, sentinel); + auto llt_val = + checked_load(llt_vals, llt_begin + lane, llt_end, zero()); + auto l_col = checked_load(l_col_idxs, l_begin + lane, l_end, sentinel); + auto l_val = checked_load(l_vals, l_begin + lane, l_end, zero()); + bool skip_first{}; + while (out_begin < 
out_size) { + // merge subwarp.size() elements from A and L*L^T + auto merge_result = + group_merge_step(a_col, llt_col, subwarp); + auto a_cur_col = merge_result.a_val; + auto llt_cur_col = merge_result.b_val; + auto a_cur_val = subwarp.shfl(a_val, merge_result.a_idx); + auto llt_cur_val = subwarp.shfl(llt_val, merge_result.b_idx); + auto valid = out_begin + lane < out_size; + // check if the previous thread has matching columns + auto equal_mask = subwarp.ballot(a_cur_col == llt_cur_col && valid); + auto prev_equal_mask = equal_mask << 1 | skip_first; + skip_first = bool(equal_mask >> (subwarp_size - 1)); + auto prev_equal = bool(prev_equal_mask & lanemask_eq); + + auto r_col = min(a_cur_col, llt_cur_col); + // find matching entry of L + // S(L) is a subset of S(A - L * L^T) since L has a diagonal + auto l_source = synchronous_fixed_binary_search( + [&](int i) { return subwarp.shfl(l_col, i) >= r_col; }); + auto l_cur_col = subwarp.shfl(l_col, l_source); + auto l_cur_val = subwarp.shfl(l_val, l_source); + + // determine actual values of A and L*L^T at r_col + if (r_col != a_cur_col) { + a_cur_val = zero(); + } + if (r_col != llt_cur_col) { + llt_cur_val = zero(); + } + auto r_val = a_cur_val - llt_cur_val; + + // early return when reaching the upper diagonal + if (subwarp.all(r_col > row)) { + break; + } + + // determine which threads will write output to L + auto use_l = l_cur_col == r_col; + auto do_write = !prev_equal && valid && r_col <= row; + auto l_new_advance_mask = subwarp.ballot(do_write); + // store values + if (do_write) { + auto diag = l_vals[l_row_ptrs[r_col + 1] - 1]; + auto out_val = use_l ? 
l_cur_val : r_val / diag; + auto ofs = popcnt(l_new_advance_mask & lanemask_lt); + l_new_col_idxs[l_new_begin + ofs] = r_col; + l_new_vals[l_new_begin + ofs] = out_val; + } + + // advance *_begin offsets + auto a_advance = merge_result.a_advance; + auto llt_advance = merge_result.b_advance; + auto l_advance = popcnt(subwarp.ballot(do_write && use_l)); + auto l_new_advance = popcnt(l_new_advance_mask); + a_begin += a_advance; + llt_begin += llt_advance; + l_begin += l_advance; + l_new_begin += l_new_advance; + out_begin += subwarp_size; + + // shuffle the unmerged elements to the front + a_col = subwarp.shfl_down(a_col, a_advance); + a_val = subwarp.shfl_down(a_val, a_advance); + llt_col = subwarp.shfl_down(llt_col, llt_advance); + llt_val = subwarp.shfl_down(llt_val, llt_advance); + l_col = subwarp.shfl_down(l_col, l_advance); + l_val = subwarp.shfl_down(l_val, l_advance); + /* + * To optimize memory access, we load the new elements for `a` and `llt` + * with a single load instruction: + * the lower part of the group loads new elements for `a` + * the upper part of the group loads new elements for `llt` + * `load_lane` is the part-local lane idx + * The elements for `a` have to be shuffled up afterwards. + */ + auto load_a = lane < a_advance; + auto load_lane = load_a ? lane : lane - a_advance; + auto load_source_col = load_a ? a_col_idxs : llt_col_idxs; + auto load_source_val = load_a ? a_vals : llt_vals; + auto load_begin = + load_a ? a_begin + llt_advance : llt_begin + a_advance; + auto load_end = load_a ? 
a_end : llt_end; + + auto load_idx = load_begin + load_lane; + auto loaded_col = + checked_load(load_source_col, load_idx, load_end, sentinel); + auto loaded_val = checked_load(load_source_val, load_idx, load_end, + zero()); + // shuffle the `a` values to the end of the warp + auto lower_loaded_col = subwarp.shfl_up(loaded_col, llt_advance); + auto lower_loaded_val = subwarp.shfl_up(loaded_val, llt_advance); + if (lane >= llt_advance) { + a_col = lower_loaded_col; + a_val = lower_loaded_val; + } + if (lane >= a_advance) { + llt_col = loaded_col; + llt_val = loaded_val; + } + // load the new values for l + if (lane >= subwarp_size - l_advance) { + auto l_idx = l_begin + lane; + l_col = checked_load(l_col_idxs, l_idx, l_end, sentinel); + l_val = checked_load(l_vals, l_idx, l_end, zero()); + } + } +} + + +} // namespace kernel \ No newline at end of file diff --git a/common/factorization/par_ict_sweep_kernels.hpp.inc b/common/factorization/par_ict_sweep_kernels.hpp.inc new file mode 100644 index 00000000000..060bacb2144 --- /dev/null +++ b/common/factorization/par_ict_sweep_kernels.hpp.inc @@ -0,0 +1,103 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +namespace kernel { + + +template +__global__ __launch_bounds__(default_block_size) void ict_sweep( + const IndexType *__restrict__ a_row_ptrs, + const IndexType *__restrict__ a_col_idxs, + const ValueType *__restrict__ a_vals, + const IndexType *__restrict__ l_row_ptrs, + const IndexType *__restrict__ l_row_idxs, + const IndexType *__restrict__ l_col_idxs, ValueType *__restrict__ l_vals, + IndexType l_nnz) +{ + auto l_nz = thread::get_subwarp_id_flat(); + if (l_nz >= l_nnz) { + return; + } + auto row = l_row_idxs[l_nz]; + auto col = l_col_idxs[l_nz]; + auto subwarp = + group::tiled_partition(group::this_thread_block()); + // find entry of A at (row, col) + auto a_row_begin = a_row_ptrs[row]; + auto a_row_end = a_row_ptrs[row + 1]; + auto a_row_size = a_row_end - a_row_begin; + auto a_idx = + group_wide_search(a_row_begin, a_row_size, subwarp, + [&](IndexType i) { return a_col_idxs[i] >= col; }); + bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col; + auto a_val = has_a ? 
a_vals[a_idx] : zero(); + auto l_row_begin = l_row_ptrs[row]; + auto l_row_size = l_row_ptrs[row + 1] - l_row_begin; + auto lt_col_begin = l_row_ptrs[col]; + auto lt_col_size = l_row_ptrs[col + 1] - lt_col_begin; + ValueType sum{}; + IndexType lt_nz{}; + auto last_entry = col; + group_merge( + l_col_idxs + l_row_begin, l_row_size, l_col_idxs + lt_col_begin, + lt_col_size, subwarp, + [&](IndexType l_idx, IndexType l_col, IndexType lt_idx, + IndexType lt_row, IndexType, bool) { + // we don't need to use the `bool valid` because last_entry is + // already a smaller sentinel value than the one used in group_merge + if (l_col == lt_row && l_col < last_entry) { + sum += + l_vals[l_idx + l_row_begin] * l_vals[lt_idx + lt_col_begin]; + } + // remember the transposed element + auto found_transp = subwarp.ballot(lt_row == row); + if (found_transp) { + lt_nz = + subwarp.shfl(lt_idx + lt_col_begin, ffs(found_transp) - 1); + } + return true; + }); + // accumulate result from all threads + sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; }); + + if (subwarp.thread_rank() == 0) { + auto to_write = row == col + ? sqrt(a_val - sum) + : (a_val - sum) / l_vals[l_row_ptrs[col + 1] - 1]; + if (is_finite(to_write)) { + l_vals[l_nz] = to_write; + } + } +} + + +} // namespace kernel \ No newline at end of file diff --git a/common/factorization/par_ilu_kernels.hpp.inc b/common/factorization/par_ilu_kernels.hpp.inc new file mode 100644 index 00000000000..af28012cf81 --- /dev/null +++ b/common/factorization/par_ilu_kernels.hpp.inc @@ -0,0 +1,82 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/
+
+namespace kernel {
+
+
+/**
+ * @internal
+ *
+ * Updates one entry of the L or U factor per thread.
+ *
+ * Thread `elem_id` reads the entry (row, col, value) from `row_idxs`,
+ * `col_idxs` and `values`, then walks row `row` of the L arrays and row `col`
+ * of the U arrays in lock-step. Every pair with equal column index contributes
+ * a product that is subtracted from the value. The product of the final loop
+ * iteration is added back afterwards. For `row > col` the result is divided by
+ * the last stored entry of row `col` of the U arrays and written to L,
+ * otherwise it is written to U. Non-finite results are discarded.
+ *
+ * NOTE(review): the template parameter list seems to have been lost in this
+ * copy (bare `template`); ValueType / IndexType are presumably its parameters.
+ * NOTE(review): indexing U by `col` suggests U is stored transposed -- confirm
+ * against the caller.
+ */
+template
+__global__ __launch_bounds__(default_block_size) void compute_l_u_factors(
+    size_type num_elements, const IndexType *__restrict__ row_idxs,
+    const IndexType *__restrict__ col_idxs,
+    const ValueType *__restrict__ values,
+    const IndexType *__restrict__ l_row_ptrs,
+    const IndexType *__restrict__ l_col_idxs, ValueType *__restrict__ l_values,
+    const IndexType *__restrict__ u_row_ptrs,
+    const IndexType *__restrict__ u_col_idxs, ValueType *__restrict__ u_values)
+{
+    const auto elem_id = thread::get_thread_id_flat();
+    // threads beyond the last entry have nothing to do
+    if (elem_id >= num_elements) {
+        return;
+    }
+
+    const auto row = row_idxs[elem_id];
+    const auto col = col_idxs[elem_id];
+    const auto l_end = l_row_ptrs[row + 1];
+    const auto u_end = u_row_ptrs[col + 1];
+    auto l_pos = l_row_ptrs[row];
+    auto u_pos = u_row_ptrs[col];
+    ValueType residual{values[elem_id]};
+    ValueType last_product{};
+
+    // two-pointer intersection of the two sorted index ranges
+    while (l_pos < l_end && u_pos < u_end) {
+        const auto l_col = l_col_idxs[l_pos];
+        const auto u_col = u_col_idxs[u_pos];
+        last_product = zero();
+        if (l_col == u_col) {
+            last_product = l_values[l_pos] * u_values[u_pos];
+            residual -= last_product;
+        }
+        // advance whichever side holds the smaller index (both on a match)
+        if (l_col <= u_col) {
+            ++l_pos;
+        }
+        if (u_col <= l_col) {
+            ++u_pos;
+        }
+    }
+    // take back the contribution of the final iteration (zero if it was no
+    // match); presumably that pair involves the entry being updated itself
+    residual += last_product;
+
+    if (row > col) {
+        // strictly lower entry: scale by the last entry of U's row `col`
+        const auto scaled = residual / u_values[u_end - 1];
+        if (is_finite(scaled)) {
+            l_values[l_pos - 1] = scaled;
+        }
+    } else if (is_finite(residual)) {
+        u_values[u_pos - 1] = residual;
+    }
+}
+
+
+}  // namespace kernel
diff --git a/common/factorization/par_ilut_filter_kernels.hpp.inc b/common/factorization/par_ilut_filter_kernels.hpp.inc
new file mode 100644
index 00000000000..43addc2504b
--- /dev/null
+++ b/common/factorization/par_ilut_filter_kernels.hpp.inc
@@ -0,0 +1,191 @@
+/*************************************************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/
+
+namespace kernel {
+
+
+// NOTE(review): every bare `template` keyword and several call sites below
+// (e.g. `group::thread_block_tile()`, `zero()`, `remove_complex`) appear to
+// have lost their angle-bracket argument lists in this copy of the patch;
+// `subwarp_size`, `Predicate` etc. are used but never declared here. Confirm
+// the exact signatures against the upstream file.
+
+/**
+ * @internal
+ *
+ * Shared row-traversal skeleton of the filter kernels.
+ *
+ * The row index comes from thread::get_subwarp_id_flat(); threads whose row is
+ * not below `num_rows` return before any callback runs. Otherwise the entries
+ * [row_ptrs[row], row_ptrs[row + 1]) are visited in steps of `subwarp_size`,
+ * one entry per lane and step.
+ *
+ * @param row_ptrs  row pointers delimiting the entries of each row
+ * @param num_rows  number of rows
+ * @param pred  called as `pred(idx, begin, end)` for in-range entries only;
+ *              a true result marks the entry as kept
+ * @param begin_cb  called as `begin_cb(row)` once before the first step
+ * @param step_cb  called as `step_cb(row, idx, keep, count, prefix)` in every
+ *                 step; `count` is the number of kept entries over all lanes
+ *                 in this step, `prefix` the number of kept entries in lanes
+ *                 below the calling lane
+ * @param finish_cb  called as `finish_cb(row, lane)` after the last step
+ */
+template
+__device__ void abstract_filter_impl(const IndexType *row_ptrs,
+                                     IndexType num_rows, Predicate pred,
+                                     BeginCallback begin_cb,
+                                     StepCallback step_cb,
+                                     FinishCallback finish_cb)
+{
+    auto subwarp = group::thread_block_tile();
+    auto row = thread::get_subwarp_id_flat();
+    auto lane = subwarp.thread_rank();
+    // one bit set for every lane below this one
+    auto lane_prefix_mask = (config::lane_mask_type(1) << lane) - 1;
+    if (row >= num_rows) {
+        return;
+    }
+
+    auto begin = row_ptrs[row];
+    auto end = row_ptrs[row + 1];
+    begin_cb(row);
+    auto num_steps = ceildiv(end - begin, subwarp_size);
+    for (auto step = 0; step < num_steps; ++step) {
+        auto idx = begin + lane + step * subwarp_size;
+        // short-circuit: lanes past the row end never evaluate pred
+        auto keep = idx < end && pred(idx, begin, end);
+        auto mask = subwarp.ballot(keep);
+        step_cb(row, idx, keep, popcnt(mask), popcnt(mask & lane_prefix_mask));
+    }
+    finish_cb(row, lane);
+}
+
+
+/**
+ * @internal
+ *
+ * Counts, for the row handled by the calling threads, the entries accepted by
+ * `pred` and stores the count in `nnz[row]` (written by lane 0 only).
+ *
+ * @param row_ptrs  row pointers of the matrix to filter
+ * @param num_rows  number of rows
+ * @param pred  predicate as described for abstract_filter_impl
+ * @param nnz  output array receiving one count per row
+ */
+template
+__device__ void abstract_filter_nnz(const IndexType *__restrict__ row_ptrs,
+                                    IndexType num_rows, Predicate pred,
+                                    IndexType *__restrict__ nnz)
+{
+    IndexType count{};
+    abstract_filter_impl(
+        row_ptrs, num_rows, pred, [&](IndexType) { count = 0; },
+        [&](IndexType, IndexType, bool, IndexType warp_count, IndexType) {
+            count += warp_count;
+        },
+        [&](IndexType row, IndexType lane) {
+            // abstract_filter_impl only reaches this callback for
+            // row < num_rows, so the row check is a redundant safeguard
+            if (row < num_rows && lane == 0) {
+                nnz[row] = count;
+            }
+        });
+}
+
+
+/**
+ * @internal
+ *
+ * Copies the entries accepted by `pred` from the old arrays to the new ones.
+ * The kept entries of a row are written contiguously starting at
+ * `new_row_ptrs[row]`, in their original order.
+ *
+ * @param old_row_ptrs  row pointers of the input matrix
+ * @param old_col_idxs  column indices of the input matrix
+ * @param old_vals  values of the input matrix
+ * @param num_rows  number of rows
+ * @param pred  predicate as described for abstract_filter_impl
+ * @param new_row_ptrs  row pointers of the output matrix
+ * @param new_row_idxs  optional output row indices; skipped when null
+ * @param new_col_idxs  output column indices
+ * @param new_vals  output values
+ */
+template
+__device__ void abstract_filter(const IndexType *__restrict__ old_row_ptrs,
+                                const IndexType *__restrict__ old_col_idxs,
+                                const ValueType *__restrict__ old_vals,
+                                IndexType num_rows, Predicate pred,
+                                const IndexType *__restrict__ new_row_ptrs,
+                                IndexType *__restrict__ new_row_idxs,
+                                IndexType *__restrict__ new_col_idxs,
+                                ValueType *__restrict__ new_vals)
+{
+    IndexType count{};
+    IndexType new_offset{};
+    abstract_filter_impl(
+        old_row_ptrs, num_rows, pred,
+        [&](IndexType row) {
+            new_offset = new_row_ptrs[row];
+            count = 0;
+        },
+        [&](IndexType row, IndexType idx, bool keep, IndexType warp_count,
+            IndexType warp_prefix_sum) {
+            if (keep) {
+                // row start + entries kept in earlier steps + entries kept by
+                // lower lanes in this step
+                auto new_idx = new_offset + warp_prefix_sum + count;
+                if (new_row_idxs) {
+                    new_row_idxs[new_idx] = row;
+                }
+                new_col_idxs[new_idx] = old_col_idxs[idx];
+                new_vals[new_idx] = old_vals[idx];
+            }
+            count += warp_count;
+        },
+        [](IndexType, IndexType) {});
+}
+
+
+/**
+ * @internal
+ *
+ * Counts per row the entries whose absolute value is at least `threshold`.
+ * One entry per row is always kept: the last one if `lower` is set, the first
+ * one otherwise (presumably the position of the diagonal in L and U,
+ * respectively -- confirm against the caller).
+ */
+template
+__global__ __launch_bounds__(default_block_size) void threshold_filter_nnz(
+    const IndexType *__restrict__ row_ptrs, const ValueType *vals,
+    IndexType num_rows, remove_complex threshold,
+    IndexType *__restrict__ nnz, bool lower)
+{
+    abstract_filter_nnz(
+        row_ptrs, num_rows,
+        [&](IndexType idx, IndexType row_begin, IndexType row_end) {
+            auto diag_idx = lower ? row_end - 1 : row_begin;
+            return abs(vals[idx]) >= threshold || idx == diag_idx;
+        },
+        nnz);
+}
+
+
+/**
+ * @internal
+ *
+ * Copies the entries selected by the same predicate as threshold_filter_nnz
+ * into the new arrays; see abstract_filter for the output layout.
+ */
+template
+__global__ __launch_bounds__(default_block_size) void threshold_filter(
+    const IndexType *__restrict__ old_row_ptrs,
+    const IndexType *__restrict__ old_col_idxs,
+    const ValueType *__restrict__ old_vals, IndexType num_rows,
+    remove_complex threshold,
+    const IndexType *__restrict__ new_row_ptrs,
+    IndexType *__restrict__ new_row_idxs, IndexType *__restrict__ new_col_idxs,
+    ValueType *__restrict__ new_vals, bool lower)
+{
+    abstract_filter(
+        old_row_ptrs, old_col_idxs, old_vals, num_rows,
+        [&](IndexType idx, IndexType row_begin, IndexType row_end) {
+            auto diag_idx = lower ? row_end - 1 : row_begin;
+            return abs(old_vals[idx]) >= threshold || idx == diag_idx;
+        },
+        new_row_ptrs, new_row_idxs, new_col_idxs, new_vals);
+}
+
+
+/**
+ * @internal
+ *
+ * Counts per row the entries whose bucket index is at least `bucket`; the last
+ * entry of every row is always counted.
+ */
+template
+__global__ __launch_bounds__(default_block_size) void bucket_filter_nnz(
+    const IndexType *__restrict__ row_ptrs, const BucketType *buckets,
+    IndexType num_rows, BucketType bucket, IndexType *__restrict__ nnz)
+{
+    abstract_filter_nnz(
+        row_ptrs, num_rows,
+        [&](IndexType idx, IndexType row_begin, IndexType row_end) {
+            return buckets[idx] >= bucket || idx == row_end - 1;
+        },
+        nnz);
+}
+
+
+/**
+ * @internal
+ *
+ * Copies the entries selected by the same predicate as bucket_filter_nnz into
+ * the new arrays; see abstract_filter for the output layout.
+ */
+template
+__global__ __launch_bounds__(default_block_size) void bucket_filter(
+    const IndexType *__restrict__ old_row_ptrs,
+    const IndexType *__restrict__ old_col_idxs,
+    const ValueType *__restrict__ old_vals, const BucketType *buckets,
+    IndexType num_rows, BucketType bucket,
+    const IndexType *__restrict__ new_row_ptrs,
+    IndexType *__restrict__ new_row_idxs, IndexType *__restrict__ new_col_idxs,
+    ValueType *__restrict__ new_vals)
+{
+    abstract_filter(
+        old_row_ptrs, old_col_idxs, old_vals, num_rows,
+        [&](IndexType idx, IndexType row_begin, IndexType row_end) {
+            return buckets[idx] >= bucket || idx == row_end - 1;
+        },
+        new_row_ptrs, new_row_idxs, new_col_idxs, new_vals);
+}
+
+
+}  // namespace kernel
\ No newline at end of file
diff --git a/common/factorization/par_ilut_select_kernels.hpp.inc b/common/factorization/par_ilut_select_kernels.hpp.inc
new file mode 100644
index 00000000000..a7a6b5a01f7
--- /dev/null
+++ b/common/factorization/par_ilut_select_kernels.hpp.inc
@@ -0,0 +1,308 @@
+/*************************************************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. 
Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+namespace kernel {
+
+
+// NOTE(review): every bare `template` keyword and several call sites below
+// (e.g. `static_cast(...)`, `zero()`, `bitonic_sort(...)`,
+// `group::tiled_partition(...)`, `remove_complex`) appear to have lost their
+// angle-bracket argument lists in this copy of the patch -- confirm the exact
+// signatures against the upstream file.
+
+// number of leaves (= buckets) of the splitter search tree
+constexpr auto searchtree_width = 1 << sampleselect_searchtree_height;
+// number of inner nodes (= splitters) of the search tree
+constexpr auto searchtree_inner_size = searchtree_width - 1;
+// inner nodes followed by leaves
+constexpr auto searchtree_size = searchtree_width + searchtree_inner_size;
+
+// total number of samples drawn by build_searchtree
+constexpr auto sample_size = searchtree_width * sampleselect_oversampling;
+
+// basecase_select sorts up to basecase_size elements, basecase_local_size of
+// them per thread
+constexpr auto basecase_size = 1024;
+constexpr auto basecase_local_size = 4;
+constexpr auto basecase_block_size = basecase_size / basecase_local_size;
+
+
+// must be launched with one thread block and block size == searchtree_width
+/**
+ * @internal
+ *
+ * Samples `searchtree_width - 1` uniformly distributed elements
+ * and stores them in a binary search tree as splitters.
+ *
+ * @param input  the values to sample from; absolute values are used
+ * @param size  number of elements in `input`
+ * @param tree_output  receives `searchtree_inner_size` inner nodes followed
+ *                     by `searchtree_width` leaf entries
+ */
+template
+__global__ __launch_bounds__(searchtree_width) void build_searchtree(
+    const ValueType *__restrict__ input, IndexType size,
+    remove_complex *__restrict__ tree_output)
+{
+    using AbsType = remove_complex;
+    auto idx = threadIdx.x;
+    AbsType samples[sampleselect_oversampling];
+    // assuming rounding towards zero
+    auto stride = double(size) / sample_size;
+#pragma unroll
+    for (auto i = 0; i < sampleselect_oversampling; ++i) {
+        auto lidx = idx * sampleselect_oversampling + i;
+        auto val = input[static_cast(lidx * stride)];
+        samples[i] = abs(val);
+    }
+    __shared__ AbsType sh_samples[sample_size];
+    bitonic_sort(samples, sh_samples);
+    // thread 0 contributes no inner node: there are only
+    // searchtree_width - 1 splitters
+    if (idx > 0) {
+        // root has level 0
+        auto level = sampleselect_searchtree_height - ffs(threadIdx.x);
+        // we get the in-level index by removing trailing 10000...
+        auto idx_in_level = threadIdx.x >> ffs(threadIdx.x);
+        // we get the global index by adding previous levels
+        auto previous_levels = (1 << level) - 1;
+        tree_output[idx_in_level + previous_levels] = samples[0];
+    }
+    // every thread additionally stores its first sample as a leaf entry
+    tree_output[threadIdx.x + searchtree_inner_size] = samples[0];
+}
+
+
+// must be launched with default_block_size >= searchtree_width
+/**
+ * @internal
+ *
+ * Computes the number of elements in each of the buckets defined
+ * by the splitter search tree. Stores the thread-block local
+ * results packed by bucket idx.
+ *
+ * @param input  the values to classify; absolute values are used
+ * @param size  number of elements in `input`
+ * @param tree  the splitters written by build_searchtree
+ * @param counter  output; the count of bucket `b` in block `k` is stored at
+ *                 `counter[k + b * gridDim.x]`
+ * @param oracles  output; bucket index of every input element
+ * @param items_per_thread  number of elements each thread processes
+ */
+template
+__global__ __launch_bounds__(default_block_size) void count_buckets(
+    const ValueType *__restrict__ input, IndexType size,
+    const remove_complex *__restrict__ tree, IndexType *counter,
+    unsigned char *oracles, int items_per_thread)
+{
+    // load tree into shared memory, initialize counters
+    __shared__ remove_complex sh_tree[searchtree_inner_size];
+    __shared__ IndexType sh_counter[searchtree_width];
+    if (threadIdx.x < searchtree_inner_size) {
+        sh_tree[threadIdx.x] = tree[threadIdx.x];
+    }
+    if (threadIdx.x < searchtree_width) {
+        sh_counter[threadIdx.x] = 0;
+    }
+    group::this_thread_block().sync();
+
+    // work distribution: each thread block gets a consecutive index range
+    auto begin = threadIdx.x + default_block_size *
+                                   static_cast(blockIdx.x) *
+                                   items_per_thread;
+    auto block_end = default_block_size *
+                     static_cast(blockIdx.x + 1) * items_per_thread;
+    auto end = min(block_end, size);
+    for (IndexType i = begin; i < end; i += default_block_size) {
+        // traverse the search tree with the input element
+        auto el = abs(input[i]);
+        IndexType tree_idx{};
+#pragma unroll
+        for (auto level = 0; level < sampleselect_searchtree_height; ++level) {
+            // go right (cmp == 1) unless the element is below the splitter
+            auto cmp = !(el < sh_tree[tree_idx]);
+            tree_idx = 2 * tree_idx + 1 + cmp;
+        }
+        // increment the bucket counter and store the bucket index
+        uint32 bucket = tree_idx - searchtree_inner_size;
+        // post-condition: sample[bucket] <= el < sample[bucket + 1]
+        atomic_add(sh_counter + bucket, 1);
+        oracles[i] = bucket;
+    }
+    group::this_thread_block().sync();
+
+    // write back the block-wide counts to global memory
+    if (threadIdx.x < searchtree_width) {
+        counter[blockIdx.x + threadIdx.x * gridDim.x] = sh_counter[threadIdx.x];
+    }
+}
+
+
+// must be launched with default_block_size threads per block
+/**
+ * @internal
+ *
+ * Simultaneously computes a prefix and total sum of the block-local counts for
+ * each bucket. The results are then used as base offsets for the following
+ * filter step.
+ *
+ * Thread block `blockIdx.x` handles bucket `blockIdx.x`.
+ *
+ * @param counters  in/out; `num_blocks` consecutive counts per bucket, replaced
+ *                  in place by their prefix sums
+ * @param totals  output; total count of every bucket
+ * @param num_blocks  number of counts per bucket
+ */
+template
+__global__ __launch_bounds__(default_block_size) void block_prefix_sum(
+    IndexType *__restrict__ counters, IndexType *__restrict__ totals,
+    IndexType num_blocks)
+{
+    constexpr auto num_warps = default_block_size / config::warp_size;
+    // the per-warp totals are scanned by a single warp further below
+    static_assert(num_warps < config::warp_size,
+                  "block size needs to be smaller");
+    __shared__ IndexType warp_sums[num_warps];
+
+    auto block = group::this_thread_block();
+    auto warp = group::tiled_partition(block);
+
+    auto bucket = blockIdx.x;
+    auto local_counters = counters + num_blocks * bucket;
+    // NOTE(review): work_per_warp is derived from the full num_blocks, so
+    // base_idx of every warp except warp 0 is already >= num_blocks and those
+    // warps find no in-range element. The result is unaffected; confirm
+    // whether this distribution is intended.
+    auto work_per_warp = ceildiv(num_blocks, warp.size());
+    auto warp_idx = threadIdx.x / warp.size();
+    auto warp_lane = warp.thread_rank();
+
+    // compute prefix sum over warp-sized blocks
+    IndexType total{};
+    auto base_idx = warp_idx * work_per_warp * warp.size();
+    for (auto step = 0; step < work_per_warp; ++step) {
+        auto idx = warp_lane + step * warp.size() + base_idx;
+        auto val = idx < num_blocks ? local_counters[idx] : zero();
+        IndexType warp_total{};
+        IndexType warp_prefix{};
+        // compute inclusive prefix sum
+        subwarp_prefix_sum(val, warp_prefix, warp_total, warp);
+
+        if (idx < num_blocks) {
+            local_counters[idx] = warp_prefix + total;
+        }
+        total += warp_total;
+    }
+
+    // store total sum
+    if (warp_lane == 0) {
+        warp_sums[warp_idx] = total;
+    }
+
+    // compute prefix sum over all warps in a single warp
+    block.sync();
+    if (warp_idx == 0) {
+        auto in_bounds = warp_lane < num_warps;
+        auto val = in_bounds ? warp_sums[warp_lane] : zero();
+        IndexType prefix_sum{};
+        IndexType total_sum{};
+        // compute inclusive prefix sum
+        subwarp_prefix_sum(val, prefix_sum, total_sum, warp);
+        if (in_bounds) {
+            warp_sums[warp_lane] = prefix_sum;
+        }
+        if (warp_lane == 0) {
+            totals[bucket] = total_sum;
+        }
+    }
+
+    // add block prefix sum to each warp's block of data
+    block.sync();
+    auto warp_prefixsum = warp_sums[warp_idx];
+    for (auto step = 0; step < work_per_warp; ++step) {
+        auto idx = warp_lane + step * warp.size() + base_idx;
+        // the counter is updated in place, no separate load is needed
+        if (idx < num_blocks) {
+            local_counters[idx] += warp_prefixsum;
+        }
+    }
+}
+
+
+// must be launched with default_block_size >= searchtree_width
+/**
+ * @internal
+ *
+ * This copies all elements from a single bucket of the input to the output.
+ *
+ * @param input  the values to filter; absolute values are written
+ * @param size  number of elements in `input`
+ * @param bucket  index of the bucket to extract
+ * @param oracles  bucket index of every input element (see count_buckets)
+ * @param block_offsets  output offset of block `k` for bucket `b`, read from
+ *                       `block_offsets[k + b * gridDim.x]`
+ * @param output  receives the absolute values of the selected elements
+ * @param items_per_thread  number of elements each thread processes
+ */
+template
+__global__ __launch_bounds__(default_block_size) void filter_bucket(
+    const ValueType *__restrict__ input, IndexType size, unsigned char bucket,
+    const unsigned char *oracles, const IndexType *block_offsets,
+    remove_complex *__restrict__ output, int items_per_thread)
+{
+    // initialize the counter with the block prefix sum.
+    __shared__ IndexType counter;
+    if (threadIdx.x == 0) {
+        counter = block_offsets[blockIdx.x + bucket * gridDim.x];
+    }
+    group::this_thread_block().sync();
+
+    // same work-distribution as in count_buckets
+    auto begin = threadIdx.x + default_block_size *
+                                   static_cast(blockIdx.x) *
+                                   items_per_thread;
+    auto block_end = default_block_size *
+                     static_cast(blockIdx.x + 1) * items_per_thread;
+    auto end = min(block_end, size);
+    for (IndexType i = begin; i < end; i += default_block_size) {
+        // only copy the element when it belongs to the target bucket
+        auto found = bucket == oracles[i];
+        // adding `found` (0 or 1) reserves a slot only for matching elements
+        auto ofs = atomic_add(&counter, found);
+        if (found) {
+            output[ofs] = abs(input[i]);
+        }
+    }
+}
+
+
+/**
+ * @internal
+ *
+ * Selects the `rank`th smallest element from a small array by sorting it.
+ *
+ * @param input  the values to select from
+ * @param size  number of elements in `input`; positions beyond it are padded
+ *              with a sentinel before sorting
+ * @param rank  rank of the element to select
+ * @param out  receives the selected element
+ */
+template
+__global__ __launch_bounds__(basecase_block_size) void basecase_select(
+    const ValueType *__restrict__ input, IndexType size, IndexType rank,
+    ValueType *__restrict__ out)
+{
+    constexpr auto sentinel = device_numeric_limits::inf;
+    ValueType local[basecase_local_size];
+    __shared__ ValueType sh_local[basecase_size];
+    for (int i = 0; i < basecase_local_size; ++i) {
+        auto idx = threadIdx.x + i * basecase_block_size;
+        local[i] = idx < size ? input[idx] : sentinel;
+    }
+    bitonic_sort(local, sh_local);
+    // the result is read from thread rank / basecase_local_size,
+    // slot rank % basecase_local_size
+    if (threadIdx.x == rank / basecase_local_size) {
+        *out = local[rank % basecase_local_size];
+    }
+}
+
+
+/**
+ * @internal
+ *
+ * Finds the bucket that contains the element with the given rank
+ * and stores it and the bucket's base rank and size in the place of the prefix
+ * sum.
+ *
+ * @param prefix_sum  in: prefix sum over the bucket sizes;
+ *                    out: `prefix_sum[0]` = bucket index, `prefix_sum[1]` =
+ *                    base rank of the bucket, `prefix_sum[2]` = bucket size
+ * @param rank  the rank to look for
+ */
+template
+__global__ __launch_bounds__(config::warp_size) void find_bucket(
+    IndexType *prefix_sum, IndexType rank)
+{
+    auto warp =
+        group::tiled_partition(group::this_thread_block());
+    auto idx = group_wide_search(0, searchtree_width, warp, [&](int i) {
+        return prefix_sum[i + 1] > rank;
+    });
+    if (warp.thread_rank() == 0) {
+        auto base = prefix_sum[idx];
+        auto size = prefix_sum[idx + 1] - base;
+        // don't overwrite anything before having loaded everything!
+        prefix_sum[0] = idx;
+        prefix_sum[1] = base;
+        prefix_sum[2] = size;
+    }
+}
+
+
+}  // namespace kernel
\ No newline at end of file
diff --git a/common/factorization/par_ilut_spgeam_kernels.hpp.inc b/common/factorization/par_ilut_spgeam_kernels.hpp.inc
new file mode 100644
index 00000000000..903968bf4a6
--- /dev/null
+++ b/common/factorization/par_ilut_spgeam_kernels.hpp.inc
@@ -0,0 +1,276 @@
+/*************************************************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/
+
+namespace kernel {
+
+
+// NOTE(review): every bare `template` keyword and several call sites below
+// (e.g. `group::tiled_partition(...)`, `zero()`, `one()`, `static_cast(...)`,
+// `device_numeric_limits::max`) appear to have lost their angle-bracket
+// argument lists in this copy of the patch; `subwarp_size` is used but never
+// declared here. Confirm the exact signatures against the upstream file.
+
+/**
+ * @internal
+ *
+ * Counts, for every row, how many entries the merged column pattern of the
+ * rows of A and L*U contributes to the new L (columns <= row) and to the new U
+ * (columns >= row). The row is taken from thread::get_subwarp_id_flat().
+ *
+ * The per-row counts (not offsets) are written to `l_new_row_ptrs[row]` and
+ * `u_new_row_ptrs[row]` by lane 0; presumably a later prefix sum turns them
+ * into row pointers -- confirm against the caller.
+ */
+template
+__global__ __launch_bounds__(default_block_size) void tri_spgeam_nnz(
+    const IndexType *__restrict__ lu_row_ptrs,
+    const IndexType *__restrict__ lu_col_idxs,
+    const IndexType *__restrict__ a_row_ptrs,
+    const IndexType *__restrict__ a_col_idxs,
+    IndexType *__restrict__ l_new_row_ptrs,
+    IndexType *__restrict__ u_new_row_ptrs, IndexType num_rows)
+{
+    auto subwarp =
+        group::tiled_partition(group::this_thread_block());
+    auto row = thread::get_subwarp_id_flat();
+    if (row >= num_rows) {
+        return;
+    }
+
+    auto lu_begin = lu_row_ptrs[row];
+    auto lu_size = lu_row_ptrs[row + 1] - lu_begin;
+    auto a_begin = a_row_ptrs[row];
+    auto a_size = a_row_ptrs[row + 1] - a_begin;
+    IndexType l_count{};
+    IndexType u_count{};
+    group_merge(
+        a_col_idxs + a_begin, a_size, lu_col_idxs + lu_begin, lu_size, subwarp,
+        [&](IndexType a_nz, IndexType a_col, IndexType lu_nz, IndexType lu_col,
+            IndexType out_nz, bool valid) {
+            auto col = min(a_col, lu_col);
+            // count the number of unique elements being merged
+            // (the diagonal, col == row, is counted for both L and U)
+            l_count +=
+                popcnt(subwarp.ballot(col <= row && a_col != lu_col && valid));
+            u_count +=
+                popcnt(subwarp.ballot(col >= row && a_col != lu_col && valid));
+            return true;
+        });
+    if (subwarp.thread_rank() == 0) {
+        l_new_row_ptrs[row] = l_count;
+        u_new_row_ptrs[row] = u_count;
+    }
+}
+
+
+/**
+ * @internal
+ *
+ * Fills the column indices and values of the new L and U factors.
+ *
+ * For the row taken from thread::get_subwarp_id_flat(), the column indices of
+ * A and L*U are merged `subwarp_size` elements at a time. Every distinct
+ * column `r_col` produces an entry in the new L (if `r_col <= row`) and/or the
+ * new U (if `r_col >= row`), written from `l_new_row_ptrs[row]` /
+ * `u_new_row_ptrs[row]` onwards:
+ *  - if L + U already has an entry at `r_col`, its value is reused;
+ *  - otherwise the value is (A - L*U)(row, r_col), divided for `r_col < row`
+ *    by `u_vals[u_row_ptrs[r_col]]` (the first stored entry of that row of U);
+ *  - the diagonal entry of the new L is always set to one.
+ */
+template
+__global__ __launch_bounds__(default_block_size) void tri_spgeam_init(
+    const IndexType *__restrict__ lu_row_ptrs,
+    const IndexType *__restrict__ lu_col_idxs,
+    const ValueType *__restrict__ lu_vals,
+    const IndexType *__restrict__ a_row_ptrs,
+    const IndexType *__restrict__ a_col_idxs,
+    const ValueType *__restrict__ a_vals,
+    const IndexType *__restrict__ l_row_ptrs,
+    const IndexType *__restrict__ l_col_idxs,
+    const ValueType *__restrict__ l_vals,
+    const IndexType *__restrict__ u_row_ptrs,
+    const IndexType *__restrict__ u_col_idxs,
+    const ValueType *__restrict__ u_vals,
+    const IndexType *__restrict__ l_new_row_ptrs,
+    IndexType *__restrict__ l_new_col_idxs, ValueType *__restrict__ l_new_vals,
+    const IndexType *__restrict__ u_new_row_ptrs,
+    IndexType *__restrict__ u_new_col_idxs, ValueType *__restrict__ u_new_vals,
+    IndexType num_rows)
+{
+    auto subwarp =
+        group::tiled_partition(group::this_thread_block());
+    auto row = thread::get_subwarp_id_flat();
+    if (row >= num_rows) {
+        return;
+    }
+
+    auto lane = static_cast(subwarp.thread_rank());
+    // single bit of this lane / bits of all lower lanes
+    auto lanemask_eq = config::lane_mask_type{1} << lane;
+    auto lanemask_lt = lanemask_eq - 1;
+
+    // merge A, L*U (and L+U)
+    auto l_begin = l_row_ptrs[row];
+    auto l_end = l_row_ptrs[row + 1] - 1;  // ignore diagonal
+    auto l_size = l_end - l_begin;
+
+    auto u_begin = u_row_ptrs[row];
+    auto u_end = u_row_ptrs[row + 1];
+    auto u_size = u_end - u_begin;
+
+    // lpu_* stores the entries of L + U with the diagonal from U
+    // this allows us to act as if L and U were a single matrix
+    auto lpu_begin = l_begin;
+    auto lpu_end = lpu_begin + l_size + u_size;
+    // the U pointers are shifted by `u_begin - l_end` so that index l_end of
+    // the combined range maps to the first entry of U's row
+    auto lpu_col_idxs =
+        lpu_begin + lane < l_end ? l_col_idxs : u_col_idxs + u_begin - l_end;
+    auto lpu_vals =
+        lpu_begin + lane < l_end ? l_vals : u_vals + u_begin - l_end;
+
+    auto lu_begin = lu_row_ptrs[row];
+    auto lu_end = lu_row_ptrs[row + 1];
+    auto lu_size = lu_end - lu_begin;
+
+    auto a_begin = a_row_ptrs[row];
+    auto a_end = a_row_ptrs[row + 1];
+    auto a_size = a_end - a_begin;
+
+    // position within the merged sequence of A and L*U (duplicates included)
+    IndexType out_begin{};
+    auto out_size = lu_size + a_size;
+
+    IndexType l_new_begin = l_new_row_ptrs[row];
+    IndexType u_new_begin = u_new_row_ptrs[row];
+
+    constexpr auto sentinel = device_numeric_limits::max;
+    // load column indices and values for the first merge step
+    auto a_col = checked_load(a_col_idxs, a_begin + lane, a_end, sentinel);
+    auto a_val = checked_load(a_vals, a_begin + lane, a_end, zero());
+    auto lu_col = checked_load(lu_col_idxs, lu_begin + lane, lu_end, sentinel);
+    auto lu_val =
+        checked_load(lu_vals, lu_begin + lane, lu_end, zero());
+    auto lpu_col =
+        checked_load(lpu_col_idxs, lpu_begin + lane, lpu_end, sentinel);
+    auto lpu_val =
+        checked_load(lpu_vals, lpu_begin + lane, lpu_end, zero());
+    // carries "last lane of the previous step had matching columns" into the
+    // next step
+    bool skip_first{};
+    while (out_begin < out_size) {
+        // merge subwarp.size() elements from A and L*U
+        auto merge_result =
+            group_merge_step(a_col, lu_col, subwarp);
+        auto a_cur_col = merge_result.a_val;
+        auto lu_cur_col = merge_result.b_val;
+        auto a_cur_val = subwarp.shfl(a_val, merge_result.a_idx);
+        auto lu_cur_val = subwarp.shfl(lu_val, merge_result.b_idx);
+        auto valid = out_begin + lane < out_size;
+        // check if the previous thread has matching columns
+        auto equal_mask = subwarp.ballot(a_cur_col == lu_cur_col && valid);
+        auto prev_equal_mask = equal_mask << 1 | skip_first;
+        skip_first = bool(equal_mask >> (subwarp_size - 1));
+        auto prev_equal = bool(prev_equal_mask & lanemask_eq);
+
+        auto r_col = min(a_cur_col, lu_cur_col);
+        // find matching entry of L+U
+        // S(L + U) is a subset of S(A - L * U) since L and U have a diagonal
+        auto lpu_source = synchronous_fixed_binary_search(
+            [&](int i) { return subwarp.shfl(lpu_col, i) >= r_col; });
+        auto lpu_cur_col = subwarp.shfl(lpu_col, lpu_source);
+        auto lpu_cur_val = subwarp.shfl(lpu_val, lpu_source);
+
+        // determine actual values of A and L*U at r_col
+        if (r_col != a_cur_col) {
+            a_cur_val = zero();
+        }
+        if (r_col != lu_cur_col) {
+            lu_cur_val = zero();
+        }
+        auto r_val = a_cur_val - lu_cur_val;
+
+        // determine which threads will write output to L or U
+        auto use_lpu = lpu_cur_col == r_col;
+        auto l_new_advance_mask =
+            subwarp.ballot(r_col <= row && !prev_equal && valid);
+        auto u_new_advance_mask =
+            subwarp.ballot(r_col >= row && !prev_equal && valid);
+        // store values
+        if (!prev_equal && valid) {
+            auto diag =
+                r_col < row ? u_vals[u_row_ptrs[r_col]] : one();
+            auto out_val = use_lpu ? lpu_cur_val : r_val / diag;
+            if (r_col <= row) {
+                // output slot = number of lower lanes that also write to L
+                auto ofs = popcnt(l_new_advance_mask & lanemask_lt);
+                l_new_col_idxs[l_new_begin + ofs] = r_col;
+                l_new_vals[l_new_begin + ofs] =
+                    r_col == row ? one() : out_val;
+            }
+            if (r_col >= row) {
+                auto ofs = popcnt(u_new_advance_mask & lanemask_lt);
+                u_new_col_idxs[u_new_begin + ofs] = r_col;
+                u_new_vals[u_new_begin + ofs] = out_val;
+            }
+        }
+
+        // advance *_begin offsets
+        auto a_advance = merge_result.a_advance;
+        auto lu_advance = merge_result.b_advance;
+        auto lpu_advance =
+            popcnt(subwarp.ballot(use_lpu && !prev_equal && valid));
+        auto l_new_advance = popcnt(l_new_advance_mask);
+        auto u_new_advance = popcnt(u_new_advance_mask);
+        a_begin += a_advance;
+        lu_begin += lu_advance;
+        lpu_begin += lpu_advance;
+        l_new_begin += l_new_advance;
+        u_new_begin += u_new_advance;
+        out_begin += subwarp_size;
+
+        // shuffle the unmerged elements to the front
+        a_col = subwarp.shfl_down(a_col, a_advance);
+        a_val = subwarp.shfl_down(a_val, a_advance);
+        lu_col = subwarp.shfl_down(lu_col, lu_advance);
+        lu_val = subwarp.shfl_down(lu_val, lu_advance);
+        lpu_col = subwarp.shfl_down(lpu_col, lpu_advance);
+        lpu_val = subwarp.shfl_down(lpu_val, lpu_advance);
+        /*
+         * To optimize memory access, we load the new elements for `a` and `lu`
+         * with a single load instruction:
+         * the lower part of the group loads new elements for `a`
+         * the upper part of the group loads new elements for `lu`
+         * `load_lane` is the part-local lane idx
+         * The elements for `a` have to be shuffled up afterwards.
+         */
+        auto load_a = lane < a_advance;
+        auto load_lane = load_a ? lane : lane - a_advance;
+        auto load_source_col = load_a ? a_col_idxs : lu_col_idxs;
+        auto load_source_val = load_a ? a_vals : lu_vals;
+        auto load_begin = load_a ? a_begin + lu_advance : lu_begin + a_advance;
+        auto load_end = load_a ? a_end : lu_end;
+
+        auto load_idx = load_begin + load_lane;
+        auto loaded_col =
+            checked_load(load_source_col, load_idx, load_end, sentinel);
+        auto loaded_val = checked_load(load_source_val, load_idx, load_end,
+                                       zero());
+        // shuffle the `a` values to the end of the warp
+        auto lower_loaded_col = subwarp.shfl_up(loaded_col, lu_advance);
+        auto lower_loaded_val = subwarp.shfl_up(loaded_val, lu_advance);
+        if (lane >= lu_advance) {
+            a_col = lower_loaded_col;
+            a_val = lower_loaded_val;
+        }
+        if (lane >= a_advance) {
+            lu_col = loaded_col;
+            lu_val = loaded_val;
+        }
+        // load the new values for lpu
+        if (lane >= subwarp_size - lpu_advance) {
+            auto lpu_idx = lpu_begin + lane;
+            // update lpu pointer if we move from l to u
+            if (lpu_idx >= l_end) {
+                lpu_col_idxs = u_col_idxs + u_begin - l_end;
+                lpu_vals = u_vals + u_begin - l_end;
+            }
+            lpu_col = checked_load(lpu_col_idxs, lpu_idx, lpu_end, sentinel);
+            lpu_val =
+                checked_load(lpu_vals, lpu_idx, lpu_end, zero());
+        }
+    }
+}
+
+
+}  // namespace kernel
\ No newline at end of file
diff --git a/common/factorization/par_ilut_sweep_kernels.hpp.inc b/common/factorization/par_ilut_sweep_kernels.hpp.inc
new file mode 100644
index 00000000000..96cfc951b64
--- /dev/null
+++ b/common/factorization/par_ilut_sweep_kernels.hpp.inc
@@ -0,0 +1,121 @@
+/*************************************************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/
+
+namespace kernel {
+
+
+/**
+ * @internal
+ *
+ * Updates one entry of L or U per subwarp.
+ *
+ * Subwarp ids in [0, l_nnz) handle the L entry with that index, ids in
+ * [l_nnz, l_nnz + u_nnz) handle the U entry `id - l_nnz`; diagonal entries of
+ * L are skipped. For the entry (row, col) the subwarp looks up A(row, col)
+ * (zero if absent), merges row `row` of L with column `col` of the `ut_*`
+ * arrays and sums the products of entries with equal index below
+ * min(row, col). Lane 0 then writes
+ *  - `(a_val - sum) / ut_vals[ut_col_ptrs[col + 1] - 1]` to `l_vals[l_nz]`
+ *    for L entries,
+ *  - `a_val - sum` to both `u_vals[u_nz]` and the matching `ut_vals` entry
+ *    for U entries,
+ * unless the new value is not finite.
+ *
+ * NOTE(review): the template parameter list and several explicit template
+ * arguments (e.g. on `group::tiled_partition(...)`, `zero()`) seem to have
+ * been lost in this copy of the patch -- confirm against the upstream file.
+ */
+template
+__global__ __launch_bounds__(default_block_size) void sweep(
+    const IndexType *__restrict__ a_row_ptrs,
+    const IndexType *__restrict__ a_col_idxs,
+    const ValueType *__restrict__ a_vals,
+    const IndexType *__restrict__ l_row_ptrs,
+    const IndexType *__restrict__ l_row_idxs,
+    const IndexType *__restrict__ l_col_idxs, ValueType *__restrict__ l_vals,
+    IndexType l_nnz, const IndexType *__restrict__ u_row_idxs,
+    const IndexType *__restrict__ u_col_idxs, ValueType *__restrict__ u_vals,
+    const IndexType *__restrict__ ut_col_ptrs,
+    const IndexType *__restrict__ ut_row_idxs, ValueType *__restrict__ ut_vals,
+    IndexType u_nnz)
+{
+    auto tidx = thread::get_subwarp_id_flat();
+    if (tidx >= l_nnz + u_nnz) {
+        return;
+    }
+    // split the subwarps into two halves for lower and upper triangle
+    auto l_nz = tidx;
+    auto u_nz = l_nz - l_nnz;
+    // NOTE(review): this test only works if tidx has a signed type -- confirm
+    auto lower = u_nz < 0;
+    auto row = lower ? l_row_idxs[l_nz] : u_row_idxs[u_nz];
+    auto col = lower ? l_col_idxs[l_nz] : u_col_idxs[u_nz];
+    if (lower && row == col) {
+        // don't update the diagonal twice
+        return;
+    }
+    auto subwarp =
+        group::tiled_partition(group::this_thread_block());
+    // find entry of A at (row, col)
+    auto a_row_begin = a_row_ptrs[row];
+    auto a_row_end = a_row_ptrs[row + 1];
+    auto a_row_size = a_row_end - a_row_begin;
+    auto a_idx =
+        group_wide_search(a_row_begin, a_row_size, subwarp,
+                          [&](IndexType i) { return a_col_idxs[i] >= col; });
+    // the search only yields a lower bound, so check for an exact hit
+    bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col;
+    auto a_val = has_a ? a_vals[a_idx] : zero();
+    auto l_row_begin = l_row_ptrs[row];
+    auto l_row_size = l_row_ptrs[row + 1] - l_row_begin;
+    auto ut_col_begin = ut_col_ptrs[col];
+    auto ut_col_size = ut_col_ptrs[col + 1] - ut_col_begin;
+    ValueType sum{};
+    IndexType ut_nz{};
+    // only indices strictly below min(row, col) contribute to the sum
+    auto last_entry = min(row, col);
+    group_merge(
+        l_col_idxs + l_row_begin, l_row_size, ut_row_idxs + ut_col_begin,
+        ut_col_size, subwarp,
+        [&](IndexType l_idx, IndexType l_col, IndexType ut_idx,
+            IndexType ut_row, IndexType, bool) {
+            // we don't need to use the `bool valid` because last_entry is
+            // already a smaller sentinel value than the one used in group_merge
+            if (l_col == ut_row && l_col < last_entry) {
+                sum += l_vals[l_idx + l_row_begin] *
+                       ut_vals[ut_idx + ut_col_begin];
+            }
+            // remember the transposed element
+            // (position of row `row` within column `col` of the ut_* arrays,
+            // broadcast from the first lane that sees it)
+            auto found_transp = subwarp.ballot(ut_row == row);
+            if (found_transp) {
+                ut_nz =
+                    subwarp.shfl(ut_idx + ut_col_begin, ffs(found_transp) - 1);
+            }
+            return true;
+        });
+    // accumulate result from all threads
+    sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; });
+
+    if (subwarp.thread_rank() == 0) {
+        if (lower) {
+            // divide by the last stored entry of column `col` in ut_vals
+            auto to_write = (a_val - sum) / ut_vals[ut_col_ptrs[col + 1] - 1];
+            if (is_finite(to_write)) {
+                l_vals[l_nz] = to_write;
+            }
+        } else {
+            auto to_write = a_val - sum;
+            if (is_finite(to_write)) {
+                // keep both copies of U in sync
+                u_vals[u_nz] = to_write;
+                ut_vals[ut_nz] = to_write;
+            }
+        }
+    }
+}
+
+
+}  // namespace kernel
\ No newline at end of file
diff --git a/common/matrix/coo_kernels.hpp.inc b/common/matrix/coo_kernels.hpp.inc
new file mode 100644
index 00000000000..b4dad369eb2
--- /dev/null
+++ b/common/matrix/coo_kernels.hpp.inc
@@ -0,0 +1,275 @@
+/*************************************************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + + +namespace { + + +/** + * The device function of COO spmv + * + * @param nnz the number of nonzeros in the matrix + * @param num_lines the maximum round of each warp + * @param val the value array of the matrix + * @param col the column index array of the matrix + * @param row the row index array of the matrix + * @param b the input dense vector + * @param b_stride the stride of the input dense vector + * @param c the output dense vector + * @param c_stride the stride of the output dense vector + * @param scale the function on the added value + * + * @tparam ValueType type of values stored in the matrix + * @tparam IndexType type of matrix indexes stored in the structure + * @tparam Closure type of the function used to write the result + */ +template +__device__ void spmv_kernel(const size_type nnz, const size_type num_lines, + const ValueType *__restrict__ val, + const IndexType *__restrict__ col, + const IndexType *__restrict__ row, + const ValueType *__restrict__ b, + const size_type b_stride, ValueType *__restrict__ c, + const size_type c_stride, Closure scale) +{ + ValueType temp_val = zero(); + const auto start = static_cast(blockDim.x) * blockIdx.x * + blockDim.y * num_lines + + threadIdx.y * blockDim.x * num_lines; + const auto column_id = blockIdx.y; + size_type num = (nnz > start) * ceildiv(nnz - start, subwarp_size); + num = min(num, num_lines); + const IndexType ind_start = start + threadIdx.x; + const IndexType ind_end = ind_start + (num - 1) * subwarp_size; + IndexType ind = ind_start; + IndexType curr_row = (ind < nnz) ? row[ind] : 0; + const auto tile_block = + group::tiled_partition(group::this_thread_block()); + for (; ind < ind_end; ind += subwarp_size) { + temp_val += (ind < nnz) ? val[ind] * b[col[ind] * b_stride + column_id] + : zero(); + auto next_row = + (ind + subwarp_size < nnz) ? 
row[ind + subwarp_size] : row[nnz - 1]; + // segmented scan + if (tile_block.any(curr_row != next_row)) { + bool is_first_in_segment = + segment_scan(tile_block, curr_row, &temp_val); + if (is_first_in_segment) { + atomic_add(&(c[curr_row * c_stride + column_id]), + scale(temp_val)); + } + temp_val = zero(); + } + curr_row = next_row; + } + if (num > 0) { + ind = ind_end; + temp_val += (ind < nnz) ? val[ind] * b[col[ind] * b_stride + column_id] + : zero(); + // segmented scan + bool is_first_in_segment = + segment_scan(tile_block, curr_row, &temp_val); + if (is_first_in_segment) { + atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp_val)); + } + } +} + + +template +__global__ __launch_bounds__(spmv_block_size) void abstract_spmv( + const size_type nnz, const size_type num_lines, + const ValueType *__restrict__ val, const IndexType *__restrict__ col, + const IndexType *__restrict__ row, const ValueType *__restrict__ b, + const size_type b_stride, ValueType *__restrict__ c, + const size_type c_stride) +{ + spmv_kernel(nnz, num_lines, val, col, row, b, b_stride, c, c_stride, + [](const ValueType &x) { return x; }); +} + + +template +__global__ __launch_bounds__(spmv_block_size) void abstract_spmv( + const size_type nnz, const size_type num_lines, + const ValueType *__restrict__ alpha, const ValueType *__restrict__ val, + const IndexType *__restrict__ col, const IndexType *__restrict__ row, + const ValueType *__restrict__ b, const size_type b_stride, + ValueType *__restrict__ c, const size_type c_stride) +{ + ValueType scale_factor = alpha[0]; + spmv_kernel( + nnz, num_lines, val, col, row, b, b_stride, c, c_stride, + [&scale_factor](const ValueType &x) { return scale_factor * x; }); +} + + +/** + * The device function of COO spmm + * + * @param nnz the number of nonzeros in the matrix + * @param num_elems the maximum number of nonzeros in each warp + * @param val the value array of the matrix + * @param col the column index array of the matrix + * @param row 
the row index array of the matrix + * @param num_cols the number of columns of the matrix + * @param b the input dense vector + * @param b_stride the stride of the input dense vector + * @param c the output dense vector + * @param c_stride the stride of the output dense vector + * @param scale the function on the added value + * + * @tparam ValueType type of values stored in the matrix + * @tparam IndexType type of matrix indexes stored in the structure + * @tparam Closure type of the function used to write the result + */ +template +__device__ void spmm_kernel(const size_type nnz, const size_type num_elems, + const ValueType *__restrict__ val, + const IndexType *__restrict__ col, + const IndexType *__restrict__ row, + const size_type num_cols, + const ValueType *__restrict__ b, + const size_type b_stride, ValueType *__restrict__ c, + const size_type c_stride, Closure scale) +{ + ValueType temp = zero(); + const auto coo_idx = + (static_cast(blockDim.y) * blockIdx.x + threadIdx.y) * + num_elems; + const auto column_id = blockIdx.y * blockDim.x + threadIdx.x; + const auto coo_end = + (coo_idx + num_elems > nnz) ? 
nnz : coo_idx + num_elems; + if (column_id < num_cols && coo_idx < nnz) { + auto curr_row = row[coo_idx]; + auto idx = coo_idx; + for (; idx < coo_end - 1; idx++) { + temp += val[idx] * b[col[idx] * b_stride + column_id]; + const auto next_row = row[idx + 1]; + if (next_row != curr_row) { + atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp)); + curr_row = next_row; + temp = zero(); + } + } + temp += val[idx] * b[col[idx] * b_stride + column_id]; + atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp)); + } +} + + +template +__global__ __launch_bounds__(spmv_block_size) void abstract_spmm( + const size_type nnz, const size_type num_elems, + const ValueType *__restrict__ val, const IndexType *__restrict__ col, + const IndexType *__restrict__ row, const size_type num_cols, + const ValueType *__restrict__ b, const size_type b_stride, + ValueType *__restrict__ c, const size_type c_stride) +{ + spmm_kernel(nnz, num_elems, val, col, row, num_cols, b, b_stride, c, + c_stride, [](const ValueType &x) { return x; }); +} + + +template +__global__ __launch_bounds__(spmv_block_size) void abstract_spmm( + const size_type nnz, const size_type num_elems, + const ValueType *__restrict__ alpha, const ValueType *__restrict__ val, + const IndexType *__restrict__ col, const IndexType *__restrict__ row, + const size_type num_cols, const ValueType *__restrict__ b, + const size_type b_stride, ValueType *__restrict__ c, + const size_type c_stride) +{ + ValueType scale_factor = alpha[0]; + spmm_kernel( + nnz, num_elems, val, col, row, num_cols, b, b_stride, c, c_stride, + [&scale_factor](const ValueType &x) { return scale_factor * x; }); +} + + +} // namespace + + +namespace kernel { + + +template +__global__ __launch_bounds__(default_block_size) void convert_row_idxs_to_ptrs( + const IndexType *__restrict__ idxs, size_type num_nonzeros, + IndexType *__restrict__ ptrs, size_type length) +{ + const auto tidx = thread::get_thread_id_flat(); + + if (tidx == 0) { + ptrs[0] = 
0; + ptrs[length - 1] = num_nonzeros; + } + + if (0 < tidx && tidx < num_nonzeros) { + if (idxs[tidx - 1] < idxs[tidx]) { + for (auto i = idxs[tidx - 1] + 1; i <= idxs[tidx]; i++) { + ptrs[i] = tidx; + } + } + } +} + + +template +__global__ __launch_bounds__(config::max_block_size) void initialize_zero_dense( + size_type num_rows, size_type num_cols, size_type stride, + ValueType *__restrict__ result) +{ + const auto tidx_x = threadIdx.x + blockDim.x * blockIdx.x; + const auto tidx_y = threadIdx.y + blockDim.y * blockIdx.y; + if (tidx_x < num_cols && tidx_y < num_rows) { + result[tidx_y * stride + tidx_x] = zero(); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void fill_in_dense( + size_type nnz, const IndexType *__restrict__ row_idxs, + const IndexType *__restrict__ col_idxs, + const ValueType *__restrict__ values, size_type stride, + ValueType *__restrict__ result) +{ + const auto tidx = thread::get_thread_id_flat(); + if (tidx < nnz) { + result[stride * row_idxs[tidx] + col_idxs[tidx]] = values[tidx]; + } +} + + +} // namespace kernel \ No newline at end of file diff --git a/common/matrix/csr_kernels.hpp.inc b/common/matrix/csr_kernels.hpp.inc new file mode 100644 index 00000000000..0ee4c34dad6 --- /dev/null +++ b/common/matrix/csr_kernels.hpp.inc @@ -0,0 +1,922 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +namespace kernel { + + +template +__host__ __device__ __forceinline__ T ceildivT(T nom, T denom) +{ + return (nom + denom - 1ll) / denom; +} + + +template +__device__ __forceinline__ bool block_segment_scan_reverse( + const IndexType *__restrict__ ind, ValueType *__restrict__ val) +{ + bool last = true; + const auto reg_ind = ind[threadIdx.x]; +#pragma unroll + for (int i = 1; i < spmv_block_size; i <<= 1) { + if (i == 1 && threadIdx.x < spmv_block_size - 1 && + reg_ind == ind[threadIdx.x + 1]) { + last = false; + } + auto temp = zero(); + if (threadIdx.x >= i && reg_ind == ind[threadIdx.x - i]) { + temp = val[threadIdx.x - i]; + } + group::this_thread_block().sync(); + val[threadIdx.x] += temp; + group::this_thread_block().sync(); + } + + return last; +} + + +template +__device__ __forceinline__ void find_next_row( + const IndexType num_rows, const IndexType data_size, const IndexType ind, + IndexType *__restrict__ row, IndexType *__restrict__ row_end, + 
const IndexType row_predict, const IndexType row_predict_end, + const IndexType *__restrict__ row_ptr) +{ + if (!overflow || ind < data_size) { + if (ind >= *row_end) { + *row = row_predict; + *row_end = row_predict_end; + for (; ind >= *row_end; *row_end = row_ptr[++*row + 1]) + ; + } + + } else { + *row = num_rows - 1; + *row_end = data_size; + } +} + + +template +__device__ __forceinline__ void warp_atomic_add( + const group::thread_block_tile &group, bool force_write, + ValueType *__restrict__ val, const IndexType row, ValueType *__restrict__ c, + const size_type c_stride, const IndexType column_id, Closure scale) +{ + // do a local scan to avoid atomic collisions + const bool need_write = segment_scan(group, row, val); + if (need_write && force_write) { + atomic_add(&(c[row * c_stride + column_id]), scale(*val)); + } + if (!need_write || force_write) { + *val = zero(); + } +} + + +template +__device__ __forceinline__ void process_window( + const group::thread_block_tile &group, + const IndexType num_rows, const IndexType data_size, const IndexType ind, + IndexType *__restrict__ row, IndexType *__restrict__ row_end, + IndexType *__restrict__ nrow, IndexType *__restrict__ nrow_end, + ValueType *__restrict__ temp_val, const ValueType *__restrict__ val, + const IndexType *__restrict__ col_idxs, + const IndexType *__restrict__ row_ptrs, const ValueType *__restrict__ b, + const size_type b_stride, ValueType *__restrict__ c, + const size_type c_stride, const IndexType column_id, Closure scale) +{ + const IndexType curr_row = *row; + find_next_row(num_rows, data_size, ind, row, row_end, *nrow, + *nrow_end, row_ptrs); + // segmented scan + if (group.any(curr_row != *row)) { + warp_atomic_add(group, curr_row != *row, temp_val, curr_row, c, + c_stride, column_id, scale); + *nrow = group.shfl(*row, subwarp_size - 1); + *nrow_end = group.shfl(*row_end, subwarp_size - 1); + } + + if (!last || ind < data_size) { + const auto col = col_idxs[ind]; + *temp_val += val[ind] * 
b[col * b_stride + column_id]; + } +} + + +template +__device__ __forceinline__ IndexType get_warp_start_idx( + const IndexType nwarps, const IndexType nnz, const IndexType warp_idx) +{ + const long long cache_lines = ceildivT(nnz, wsize); + return (warp_idx * cache_lines / nwarps) * wsize; +} + + +template +__device__ __forceinline__ void spmv_kernel( + const IndexType nwarps, const IndexType num_rows, + const ValueType *__restrict__ val, const IndexType *__restrict__ col_idxs, + const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow, + const ValueType *__restrict__ b, const size_type b_stride, + ValueType *__restrict__ c, const size_type c_stride, Closure scale) +{ + const IndexType warp_idx = blockIdx.x * warps_in_block + threadIdx.y; + const IndexType column_id = blockIdx.y; + if (warp_idx >= nwarps) { + return; + } + const IndexType data_size = row_ptrs[num_rows]; + const IndexType start = get_warp_start_idx(nwarps, data_size, warp_idx); + const IndexType end = + min(get_warp_start_idx(nwarps, data_size, warp_idx + 1), + ceildivT(data_size, wsize) * wsize); + auto row = srow[warp_idx]; + auto row_end = row_ptrs[row + 1]; + auto nrow = row; + auto nrow_end = row_end; + ValueType temp_val = zero(); + IndexType ind = start + threadIdx.x; + find_next_row(num_rows, data_size, ind, &row, &row_end, nrow, + nrow_end, row_ptrs); + const IndexType ind_end = end - wsize; + const auto tile_block = + group::tiled_partition(group::this_thread_block()); + for (; ind < ind_end; ind += wsize) { + process_window(tile_block, num_rows, data_size, ind, &row, + &row_end, &nrow, &nrow_end, &temp_val, val, + col_idxs, row_ptrs, b, b_stride, c, c_stride, + column_id, scale); + } + process_window(tile_block, num_rows, data_size, ind, &row, &row_end, + &nrow, &nrow_end, &temp_val, val, col_idxs, row_ptrs, + b, b_stride, c, c_stride, column_id, scale); + warp_atomic_add(tile_block, true, &temp_val, row, c, c_stride, column_id, + scale); +} + + +template +__global__ 
__launch_bounds__(spmv_block_size) void abstract_spmv( + const IndexType nwarps, const IndexType num_rows, + const ValueType *__restrict__ val, const IndexType *__restrict__ col_idxs, + const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow, + const ValueType *__restrict__ b, const size_type b_stride, + ValueType *__restrict__ c, const size_type c_stride) +{ + spmv_kernel(nwarps, num_rows, val, col_idxs, row_ptrs, srow, b, b_stride, c, + c_stride, [](const ValueType &x) { return x; }); +} + + +template +__global__ __launch_bounds__(spmv_block_size) void abstract_spmv( + const IndexType nwarps, const IndexType num_rows, + const ValueType *__restrict__ alpha, const ValueType *__restrict__ val, + const IndexType *__restrict__ col_idxs, + const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow, + const ValueType *__restrict__ b, const size_type b_stride, + ValueType *__restrict__ c, const size_type c_stride) +{ + ValueType scale_factor = alpha[0]; + spmv_kernel(nwarps, num_rows, val, col_idxs, row_ptrs, srow, b, b_stride, c, + c_stride, [&scale_factor](const ValueType &x) { + return scale_factor * x; + }); +} + + +template +__global__ __launch_bounds__(default_block_size) void set_zero( + const size_type nnz, ValueType *__restrict__ val) +{ + const auto ind = thread::get_thread_id_flat(); + if (ind < nnz) { + val[ind] = zero(); + } +} + + +template +__forceinline__ __device__ void merge_path_search( + const IndexType diagonal, const IndexType a_len, const IndexType b_len, + const IndexType *__restrict__ a, const IndexType offset_b, + IndexType *__restrict__ x, IndexType *__restrict__ y) +{ + auto x_min = max(diagonal - b_len, zero()); + auto x_max = min(diagonal, a_len); + while (x_min < x_max) { + auto pivot = x_min + (x_max - x_min) / 2; + if (a[pivot] <= offset_b + diagonal - pivot - 1) { + x_min = pivot + 1; + } else { + x_max = pivot; + } + } + + *x = min(x_min, a_len); + *y = diagonal - x_min; +} + + +template +__device__ 
void merge_path_reduce(const IndexType nwarps, + const ValueType *__restrict__ last_val, + const IndexType *__restrict__ last_row, + ValueType *__restrict__ c, + const size_type c_stride, Alpha_op alpha_op) +{ + const IndexType cache_lines = ceildivT(nwarps, spmv_block_size); + const IndexType tid = threadIdx.x; + const IndexType start = min(tid * cache_lines, nwarps); + const IndexType end = min((tid + 1) * cache_lines, nwarps); + ValueType value = zero(); + IndexType row = last_row[nwarps - 1]; + if (start < nwarps) { + value = last_val[start]; + row = last_row[start]; + for (IndexType i = start + 1; i < end; i++) { + if (last_row[i] != row) { + c[row * c_stride] += alpha_op(value); + row = last_row[i]; + value = last_val[i]; + } else { + value += last_val[i]; + } + } + } + __shared__ UninitializedArray tmp_ind; + __shared__ UninitializedArray tmp_val; + tmp_val[threadIdx.x] = value; + tmp_ind[threadIdx.x] = row; + group::this_thread_block().sync(); + bool last = block_segment_scan_reverse(static_cast(tmp_ind), + static_cast(tmp_val)); + group::this_thread_block().sync(); + if (last) { + c[row * c_stride] += alpha_op(tmp_val[threadIdx.x]); + } +} + + +template +__device__ void merge_path_spmv( + const IndexType num_rows, const ValueType *__restrict__ val, + const IndexType *__restrict__ col_idxs, + const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow, + const ValueType *__restrict__ b, const size_type b_stride, + ValueType *__restrict__ c, const size_type c_stride, + IndexType *__restrict__ row_out, ValueType *__restrict__ val_out, + Alpha_op alpha_op, Beta_op beta_op) +{ + const auto *row_end_ptrs = row_ptrs + 1; + const auto nnz = row_ptrs[num_rows]; + const IndexType num_merge_items = num_rows + nnz; + const auto block_items = spmv_block_size * items_per_thread; + __shared__ IndexType shared_row_ptrs[block_items]; + const IndexType diagonal = + min(IndexType(block_items * blockIdx.x), num_merge_items); + const IndexType diagonal_end = 
min(diagonal + block_items, num_merge_items); + IndexType block_start_x; + IndexType block_start_y; + IndexType end_x; + IndexType end_y; + merge_path_search(diagonal, num_rows, nnz, row_end_ptrs, zero(), + &block_start_x, &block_start_y); + merge_path_search(diagonal_end, num_rows, nnz, row_end_ptrs, + zero(), &end_x, &end_y); + const IndexType block_num_rows = end_x - block_start_x; + const IndexType block_num_nonzeros = end_y - block_start_y; + for (int i = threadIdx.x; + i < block_num_rows && block_start_x + i < num_rows; + i += spmv_block_size) { + shared_row_ptrs[i] = row_end_ptrs[block_start_x + i]; + } + group::this_thread_block().sync(); + + IndexType start_x; + IndexType start_y; + merge_path_search(IndexType(items_per_thread * threadIdx.x), block_num_rows, + block_num_nonzeros, shared_row_ptrs, block_start_y, + &start_x, &start_y); + + + IndexType ind = block_start_y + start_y; + IndexType row_i = block_start_x + start_x; + ValueType value = zero(); +#pragma unroll + for (IndexType i = 0; i < items_per_thread; i++) { + if (row_i < num_rows) { + if (start_x == block_num_rows || ind < shared_row_ptrs[start_x]) { + value += val[ind] * b[col_idxs[ind] * b_stride]; + ind++; + } else { + c[row_i * c_stride] = + alpha_op(value) + beta_op(c[row_i * c_stride]); + start_x++; + row_i++; + value = zero(); + } + } + } + group::this_thread_block().sync(); + IndexType *tmp_ind = shared_row_ptrs; + ValueType *tmp_val = + reinterpret_cast(shared_row_ptrs + spmv_block_size); + tmp_val[threadIdx.x] = value; + tmp_ind[threadIdx.x] = row_i; + group::this_thread_block().sync(); + bool last = block_segment_scan_reverse(static_cast(tmp_ind), + static_cast(tmp_val)); + if (threadIdx.x == spmv_block_size - 1) { + row_out[blockIdx.x] = min(end_x, num_rows - 1); + val_out[blockIdx.x] = tmp_val[threadIdx.x]; + } else if (last) { + c[row_i * c_stride] += alpha_op(tmp_val[threadIdx.x]); + } +} + +template +__global__ __launch_bounds__(spmv_block_size) void abstract_merge_path_spmv( + 
const IndexType num_rows, const ValueType *__restrict__ val, + const IndexType *__restrict__ col_idxs, + const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow, + const ValueType *__restrict__ b, const size_type b_stride, + ValueType *__restrict__ c, const size_type c_stride, + IndexType *__restrict__ row_out, ValueType *__restrict__ val_out) +{ + merge_path_spmv( + num_rows, val, col_idxs, row_ptrs, srow, b, b_stride, c, c_stride, + row_out, val_out, [](ValueType &x) { return x; }, + [](ValueType &x) { return zero(); }); +} + + +template +__global__ __launch_bounds__(spmv_block_size) void abstract_merge_path_spmv( + const IndexType num_rows, const ValueType *__restrict__ alpha, + const ValueType *__restrict__ val, const IndexType *__restrict__ col_idxs, + const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow, + const ValueType *__restrict__ b, const size_type b_stride, + const ValueType *__restrict__ beta, ValueType *__restrict__ c, + const size_type c_stride, IndexType *__restrict__ row_out, + ValueType *__restrict__ val_out) +{ + const auto alpha_val = alpha[0]; + const auto beta_val = beta[0]; + merge_path_spmv( + num_rows, val, col_idxs, row_ptrs, srow, b, b_stride, c, c_stride, + row_out, val_out, [&alpha_val](ValueType &x) { return alpha_val * x; }, + [&beta_val](ValueType &x) { return beta_val * x; }); +} + + +template +__global__ __launch_bounds__(spmv_block_size) void abstract_reduce( + const IndexType nwarps, const ValueType *__restrict__ last_val, + const IndexType *__restrict__ last_row, ValueType *__restrict__ c, + const size_type c_stride) +{ + merge_path_reduce(nwarps, last_val, last_row, c, c_stride, + [](ValueType &x) { return x; }); +} + + +template +__global__ __launch_bounds__(spmv_block_size) void abstract_reduce( + const IndexType nwarps, const ValueType *__restrict__ last_val, + const IndexType *__restrict__ last_row, const ValueType *__restrict__ alpha, + ValueType *__restrict__ c, const size_type 
c_stride) +{ + const auto alpha_val = alpha[0]; + merge_path_reduce(nwarps, last_val, last_row, c, c_stride, + [&alpha_val](ValueType &x) { return alpha_val * x; }); +} + + +template +__device__ void device_classical_spmv(const size_type num_rows, + const ValueType *__restrict__ val, + const IndexType *__restrict__ col_idxs, + const IndexType *__restrict__ row_ptrs, + const ValueType *__restrict__ b, + const size_type b_stride, + ValueType *__restrict__ c, + const size_type c_stride, Closure scale) +{ + auto subwarp_tile = + group::tiled_partition(group::this_thread_block()); + const auto subrow = thread::get_subwarp_num_flat(); + const auto subid = subwarp_tile.thread_rank(); + const auto column_id = blockIdx.y; + auto row = thread::get_subwarp_id_flat(); + for (; row < num_rows; row += subrow) { + const auto ind_end = row_ptrs[row + 1]; + ValueType temp_val = zero(); + for (auto ind = row_ptrs[row] + subid; ind < ind_end; + ind += subwarp_size) { + temp_val += val[ind] * b[col_idxs[ind] * b_stride + column_id]; + } + auto subwarp_result = reduce( + subwarp_tile, temp_val, + [](const ValueType &a, const ValueType &b) { return a + b; }); + if (subid == 0) { + c[row * c_stride + column_id] = + scale(subwarp_result, c[row * c_stride + column_id]); + } + } +} + + +template +__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv( + const size_type num_rows, const ValueType *__restrict__ val, + const IndexType *__restrict__ col_idxs, + const IndexType *__restrict__ row_ptrs, const ValueType *__restrict__ b, + const size_type b_stride, ValueType *__restrict__ c, + const size_type c_stride) +{ + device_classical_spmv( + num_rows, val, col_idxs, row_ptrs, b, b_stride, c, c_stride, + [](const ValueType &x, const ValueType &y) { return x; }); +} + + +template +__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv( + const size_type num_rows, const ValueType *__restrict__ alpha, + const ValueType *__restrict__ val, const IndexType 
*__restrict__ col_idxs, + const IndexType *__restrict__ row_ptrs, const ValueType *__restrict__ b, + const size_type b_stride, const ValueType *__restrict__ beta, + ValueType *__restrict__ c, const size_type c_stride) +{ + const auto alpha_val = alpha[0]; + const auto beta_val = beta[0]; + device_classical_spmv( + num_rows, val, col_idxs, row_ptrs, b, b_stride, c, c_stride, + [&alpha_val, &beta_val](const ValueType &x, const ValueType &y) { + return alpha_val * x + beta_val * y; + }); +} + + +template +__global__ __launch_bounds__(default_block_size) void spgeam_nnz( + const IndexType *__restrict__ a_row_ptrs, + const IndexType *__restrict__ a_col_idxs, + const IndexType *__restrict__ b_row_ptrs, + const IndexType *__restrict__ b_col_idxs, IndexType num_rows, + IndexType *__restrict__ nnz) +{ + const auto row = thread::get_subwarp_id_flat(); + auto subwarp = + group::tiled_partition(group::this_thread_block()); + if (row >= num_rows) { + return; + } + + const auto a_begin = a_row_ptrs[row]; + const auto b_begin = b_row_ptrs[row]; + const auto a_size = a_row_ptrs[row + 1] - a_begin; + const auto b_size = b_row_ptrs[row + 1] - b_begin; + IndexType count{}; + group_merge( + a_col_idxs + a_begin, a_size, b_col_idxs + b_begin, b_size, subwarp, + [&](IndexType, IndexType a_col, IndexType, IndexType b_col, IndexType, + bool valid) { + count += popcnt(subwarp.ballot(a_col != b_col && valid)); + return true; + }); + + if (subwarp.thread_rank() == 0) { + nnz[row] = count; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void spgeam( + const ValueType *__restrict__ palpha, + const IndexType *__restrict__ a_row_ptrs, + const IndexType *__restrict__ a_col_idxs, + const ValueType *__restrict__ a_vals, const ValueType *__restrict__ pbeta, + const IndexType *__restrict__ b_row_ptrs, + const IndexType *__restrict__ b_col_idxs, + const ValueType *__restrict__ b_vals, IndexType num_rows, + const IndexType *__restrict__ c_row_ptrs, + IndexType *__restrict__ 
c_col_idxs, ValueType *__restrict__ c_vals) +{ + const auto row = thread::get_subwarp_id_flat(); + auto subwarp = + group::tiled_partition(group::this_thread_block()); + if (row >= num_rows) { + return; + } + + const auto alpha = palpha[0]; + const auto beta = pbeta[0]; + const auto lane = static_cast(subwarp.thread_rank()); + constexpr auto lanemask_full = + ~config::lane_mask_type{} >> (config::warp_size - subwarp_size); + const auto lanemask_eq = config::lane_mask_type{1} << lane; + const auto lanemask_lt = lanemask_eq - 1; + + const auto a_begin = a_row_ptrs[row]; + const auto b_begin = b_row_ptrs[row]; + const auto a_size = a_row_ptrs[row + 1] - a_begin; + const auto b_size = b_row_ptrs[row + 1] - b_begin; + auto c_begin = c_row_ptrs[row]; + bool skip_first{}; + group_merge( + a_col_idxs + a_begin, a_size, b_col_idxs + b_begin, b_size, subwarp, + [&](IndexType a_nz, IndexType a_col, IndexType b_nz, IndexType b_col, + IndexType, bool valid) { + auto c_col = min(a_col, b_col); + auto equal_mask = subwarp.ballot(a_col == b_col && valid); + // check if the elements in the previous merge step are + // equal + auto prev_equal_mask = equal_mask << 1 | skip_first; + // store the highest bit for the next group_merge_step + skip_first = bool(equal_mask >> (subwarp_size - 1)); + auto prev_equal = bool(prev_equal_mask & lanemask_eq); + // only output an entry if the previous cols weren't equal. + // if they were equal, they were both handled in the + // previous step + if (valid && !prev_equal) { + auto c_ofs = popcnt(~prev_equal_mask & lanemask_lt); + c_col_idxs[c_begin + c_ofs] = c_col; + auto a_val = + a_col <= b_col ? a_vals[a_nz + a_begin] : zero(); + auto b_val = + b_col <= a_col ? 
b_vals[b_nz + b_begin] : zero(); + c_vals[c_begin + c_ofs] = alpha * a_val + beta * b_val; + } + // advance by the number of merged elements + // in theory, we would need to mask by `valid`, but this + // would only be false somwhere in the last iteration, where + // we don't need the value of c_begin afterwards, anyways. + c_begin += popcnt(~prev_equal_mask & lanemask_full); + return true; + }); +} + + +template +__global__ __launch_bounds__(default_block_size) void convert_row_ptrs_to_idxs( + size_type num_rows, const IndexType *__restrict__ ptrs, + IndexType *__restrict__ idxs) +{ + const auto tidx = thread::get_thread_id_flat(); + if (tidx < num_rows) { + for (auto i = ptrs[tidx]; i < ptrs[tidx + 1]; i++) { + idxs[i] = tidx; + } + } +} + + +template +__global__ __launch_bounds__(config::max_block_size) void initialize_zero_dense( + size_type num_rows, size_type num_cols, size_type stride, + ValueType *__restrict__ result) +{ + const auto tidx_x = threadIdx.x + blockDim.x * blockIdx.x; + const auto tidx_y = threadIdx.y + blockDim.y * blockIdx.y; + if (tidx_x < num_cols && tidx_y < num_rows) { + result[tidx_y * stride + tidx_x] = zero(); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void fill_in_dense( + size_type num_rows, const IndexType *__restrict__ row_ptrs, + const IndexType *__restrict__ col_idxs, + const ValueType *__restrict__ values, size_type stride, + ValueType *__restrict__ result) +{ + const auto tidx = thread::get_thread_id_flat(); + if (tidx < num_rows) { + for (auto i = row_ptrs[tidx]; i < row_ptrs[tidx + 1]; i++) { + result[stride * tidx + col_idxs[i]] = values[i]; + } + } +} + + +template +__global__ __launch_bounds__(default_block_size) void calculate_nnz_per_row( + size_type num_rows, const IndexType *__restrict__ row_ptrs, + size_type *__restrict__ nnz_per_row) +{ + const auto tidx = thread::get_thread_id_flat(); + if (tidx < num_rows) { + nnz_per_row[tidx] = row_ptrs[tidx + 1] - row_ptrs[tidx]; + } +} + + 
+__global__ __launch_bounds__(config::warp_size) void calculate_slice_lengths( + size_type num_rows, size_type slice_size, size_type stride_factor, + const size_type *__restrict__ nnz_per_row, + size_type *__restrict__ slice_lengths, size_type *__restrict__ slice_sets) +{ + constexpr auto warp_size = config::warp_size; + const auto sliceid = blockIdx.x; + const auto tid_in_warp = threadIdx.x; + + if (sliceid * slice_size + tid_in_warp < num_rows) { + size_type thread_result = 0; + for (int i = tid_in_warp; i < slice_size; i += warp_size) { + thread_result = + (i + slice_size * sliceid < num_rows) + ? max(thread_result, nnz_per_row[sliceid * slice_size + i]) + : thread_result; + } + + auto warp_tile = + group::tiled_partition(group::this_thread_block()); + auto warp_result = reduce( + warp_tile, thread_result, + [](const size_type &a, const size_type &b) { return max(a, b); }); + + if (tid_in_warp == 0) { + auto slice_length = + ceildiv(warp_result, stride_factor) * stride_factor; + slice_lengths[sliceid] = slice_length; + slice_sets[sliceid] = slice_length; + } + } +} + + +template +__global__ __launch_bounds__(default_block_size) void fill_in_sellp( + size_type num_rows, size_type slice_size, + const ValueType *__restrict__ source_values, + const IndexType *__restrict__ source_row_ptrs, + const IndexType *__restrict__ source_col_idxs, + size_type *__restrict__ slice_lengths, size_type *__restrict__ slice_sets, + IndexType *__restrict__ result_col_idxs, + ValueType *__restrict__ result_values) +{ + const auto global_row = thread::get_thread_id_flat(); + const auto row = global_row % slice_size; + const auto sliceid = global_row / slice_size; + + if (global_row < num_rows) { + size_type sellp_ind = slice_sets[sliceid] * slice_size + row; + + for (size_type csr_ind = source_row_ptrs[global_row]; + csr_ind < source_row_ptrs[global_row + 1]; csr_ind++) { + result_values[sellp_ind] = source_values[csr_ind]; + result_col_idxs[sellp_ind] = source_col_idxs[csr_ind]; + 
sellp_ind += slice_size; + } + for (size_type i = sellp_ind; + i < + (slice_sets[sliceid] + slice_lengths[sliceid]) * slice_size + row; + i += slice_size) { + result_col_idxs[i] = 0; + result_values[i] = zero(); + } + } +} + + +template +__global__ __launch_bounds__(default_block_size) void initialize_zero_ell( + size_type max_nnz_per_row, size_type stride, ValueType *__restrict__ values, + IndexType *__restrict__ col_idxs) +{ + const auto tidx = thread::get_thread_id_flat(); + + if (tidx < stride * max_nnz_per_row) { + values[tidx] = zero(); + col_idxs[tidx] = 0; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void fill_in_ell( + size_type num_rows, size_type stride, + const ValueType *__restrict__ source_values, + const IndexType *__restrict__ source_row_ptrs, + const IndexType *__restrict__ source_col_idxs, + ValueType *__restrict__ result_values, + IndexType *__restrict__ result_col_idxs) +{ + constexpr auto warp_size = config::warp_size; + const auto row = thread::get_subwarp_id_flat(); + const auto local_tidx = threadIdx.x % warp_size; + + if (row < num_rows) { + for (size_type i = local_tidx; + i < source_row_ptrs[row + 1] - source_row_ptrs[row]; + i += warp_size) { + const auto result_idx = row + stride * i; + const auto source_idx = i + source_row_ptrs[row]; + result_values[result_idx] = source_values[source_idx]; + result_col_idxs[result_idx] = source_col_idxs[source_idx]; + } + } +} + + +__global__ __launch_bounds__(default_block_size) void reduce_max_nnz_per_slice( + size_type num_rows, size_type slice_size, size_type stride_factor, + const size_type *__restrict__ nnz_per_row, size_type *__restrict__ result) +{ + constexpr auto warp_size = config::warp_size; + auto warp_tile = + group::tiled_partition(group::this_thread_block()); + const auto warpid = thread::get_subwarp_id_flat(); + const auto tid_in_warp = warp_tile.thread_rank(); + const auto slice_num = ceildiv(num_rows, slice_size); + + size_type thread_result = 0; + for 
(auto i = tid_in_warp; i < slice_size; i += warp_size) { + if (warpid * slice_size + i < num_rows) { + thread_result = + max(thread_result, nnz_per_row[warpid * slice_size + i]); + } + } + auto warp_result = reduce( + warp_tile, thread_result, + [](const size_type &a, const size_type &b) { return max(a, b); }); + + if (tid_in_warp == 0 && warpid < slice_num) { + result[warpid] = ceildiv(warp_result, stride_factor) * stride_factor; + } +} + + +__global__ __launch_bounds__(default_block_size) void reduce_total_cols( + size_type num_slices, const size_type *__restrict__ max_nnz_per_slice, + size_type *__restrict__ result) +{ + __shared__ size_type block_result[default_block_size]; + + reduce_array(num_slices, max_nnz_per_slice, block_result, + [](const size_type &x, const size_type &y) { return x + y; }); + + if (threadIdx.x == 0) { + result[blockIdx.x] = block_result[0]; + } +} + + +__global__ __launch_bounds__(default_block_size) void reduce_max_nnz( + size_type size, const size_type *__restrict__ nnz_per_row, + size_type *__restrict__ result) +{ + __shared__ size_type block_max[default_block_size]; + + reduce_array( + size, nnz_per_row, block_max, + [](const size_type &x, const size_type &y) { return max(x, y); }); + + if (threadIdx.x == 0) { + result[blockIdx.x] = block_max[0]; + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void calculate_hybrid_coo_row_nnz( + size_type num_rows, size_type ell_max_nnz_per_row, + IndexType *__restrict__ csr_row_idxs, + size_type *__restrict__ coo_row_nnz) +{ + const auto tidx = thread::get_thread_id_flat(); + if (tidx < num_rows) { + const size_type csr_nnz = csr_row_idxs[tidx + 1] - csr_row_idxs[tidx]; + coo_row_nnz[tidx] = + (csr_nnz > ell_max_nnz_per_row) * (csr_nnz - ell_max_nnz_per_row); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void fill_in_hybrid( + size_type num_rows, size_type stride, size_type ell_max_nnz_per_row, + const ValueType *__restrict__ source_values, + const 
IndexType *__restrict__ source_row_ptrs, + const IndexType *__restrict__ source_col_idxs, + const size_type *__restrict__ coo_offset, + ValueType *__restrict__ result_ell_val, + IndexType *__restrict__ result_ell_col, + ValueType *__restrict__ result_coo_val, + IndexType *__restrict__ result_coo_col, + IndexType *__restrict__ result_coo_row) +{ + constexpr auto warp_size = config::warp_size; + const auto row = thread::get_subwarp_id_flat(); + const auto local_tidx = threadIdx.x % warp_size; + + if (row < num_rows) { + for (size_type i = local_tidx; + i < source_row_ptrs[row + 1] - source_row_ptrs[row]; + i += warp_size) { + const auto source_idx = i + source_row_ptrs[row]; + if (i < ell_max_nnz_per_row) { + const auto result_idx = row + stride * i; + result_ell_val[result_idx] = source_values[source_idx]; + result_ell_col[result_idx] = source_col_idxs[source_idx]; + } else { + const auto result_idx = + coo_offset[row] + i - ell_max_nnz_per_row; + result_coo_val[result_idx] = source_values[source_idx]; + result_coo_col[result_idx] = source_col_idxs[source_idx]; + result_coo_row[result_idx] = row; + } + } + } +} + + +template +__global__ __launch_bounds__(default_block_size) void check_unsorted( + const IndexType *__restrict__ row_ptrs, + const IndexType *__restrict__ col_idxs, IndexType num_rows, bool *flag) +{ + __shared__ bool sh_flag; + auto block = group::this_thread_block(); + if (block.thread_rank() == 0) { + sh_flag = *flag; + } + block.sync(); + + auto row = thread::get_thread_id_flat(); + if (row >= num_rows) { + return; + } + + // fail early + if (sh_flag) { + for (auto nz = row_ptrs[row]; nz < row_ptrs[row + 1] - 1; ++nz) { + if (col_idxs[nz] > col_idxs[nz + 1]) { + *flag = false; + sh_flag = false; + return; + } + } + } +} + + +} // namespace kernel + + +namespace { + + +template +__global__ __launch_bounds__(default_block_size) void conjugate_kernel( + size_type num_nonzeros, ValueType *__restrict__ val) +{ + const auto tidx = 
thread::get_thread_id_flat(); + + if (tidx < num_nonzeros) { + val[tidx] = conj(val[tidx]); + } +} + + +} // namespace diff --git a/common/matrix/dense_kernels.hpp.inc b/common/matrix/dense_kernels.hpp.inc new file mode 100644 index 00000000000..95e0c4ed7b2 --- /dev/null +++ b/common/matrix/dense_kernels.hpp.inc @@ -0,0 +1,484 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +namespace kernel { + + +template +__global__ __launch_bounds__(block_size) void scale( + size_type num_rows, size_type num_cols, size_type num_alpha_cols, + const ValueType *__restrict__ alpha, ValueType *__restrict__ x, + size_type stride_x) +{ + constexpr auto warps_per_block = block_size / config::warp_size; + const auto global_id = + thread::get_thread_id(); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + const auto alpha_id = num_alpha_cols == 1 ? 0 : col_id; + if (row_id < num_rows) { + x[row_id * stride_x + col_id] = + alpha[alpha_id] == zero() + ? zero() + : x[row_id * stride_x + col_id] * alpha[alpha_id]; + } +} + + +template +__global__ __launch_bounds__(block_size) void add_scaled( + size_type num_rows, size_type num_cols, size_type num_alpha_cols, + const ValueType *__restrict__ alpha, const ValueType *__restrict__ x, + size_type stride_x, ValueType *__restrict__ y, size_type stride_y) +{ + constexpr auto warps_per_block = block_size / config::warp_size; + const auto global_id = + thread::get_thread_id(); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + const auto alpha_id = num_alpha_cols == 1 ? 
0 : col_id; + if (row_id < num_rows && alpha[alpha_id] != zero()) { + y[row_id * stride_y + col_id] += + x[row_id * stride_x + col_id] * alpha[alpha_id]; + } +} + + +template +__device__ void compute_partial_reduce(size_type num_rows, + OutType *__restrict__ work, + CallableGetValue get_value, + CallableReduce reduce_op) +{ + constexpr auto warps_per_block = block_size / config::warp_size; + + const auto num_blocks = gridDim.x; + const auto local_id = thread::get_local_thread_id(); + const auto global_id = + thread::get_thread_id(); + + auto tmp = zero(); + for (auto i = global_id; i < num_rows; i += block_size * num_blocks) { + tmp = reduce_op(tmp, get_value(i)); + } + __shared__ UninitializedArray tmp_work; + tmp_work[local_id] = tmp; + + reduce(group::this_thread_block(), static_cast(tmp_work), + reduce_op); + + if (local_id == 0) { + work[thread::get_block_id()] = tmp_work[0]; + } +} + + +template +__device__ void finalize_reduce_computation(size_type size, + const ValueType *work, + ValueType *result, + CallableReduce reduce_op, + CallableFinalize finalize_op) +{ + const auto local_id = thread::get_local_thread_id(); + + ValueType tmp = zero(); + for (auto i = local_id; i < size; i += block_size) { + tmp = reduce_op(tmp, work[i]); + } + __shared__ UninitializedArray tmp_work; + tmp_work[local_id] = tmp; + + reduce(group::this_thread_block(), static_cast(tmp_work), + reduce_op); + + if (local_id == 0) { + *result = finalize_op(tmp_work[0]); + } +} + + +template +__global__ __launch_bounds__(block_size) void compute_partial_dot( + size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, + const ValueType *__restrict__ y, size_type stride_y, + ValueType *__restrict__ work) +{ + compute_partial_reduce( + num_rows, work, + [x, stride_x, y, stride_y](size_type i) { + return x[i * stride_x] * conj(y[i * stride_y]); + }, + [](const ValueType &x, const ValueType &y) { return x + y; }); +} + + +template +__global__ __launch_bounds__(block_size) void 
finalize_dot_computation( + size_type size, const ValueType *work, ValueType *result) +{ + finalize_reduce_computation( + size, work, result, + [](const ValueType &x, const ValueType &y) { return x + y; }, + [](const ValueType &x) { return x; }); +} + + +template +__global__ __launch_bounds__(block_size) void compute_partial_norm2( + size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, + remove_complex *__restrict__ work) +{ + using norm_type = remove_complex; + compute_partial_reduce( + num_rows, work, + [x, stride_x](size_type i) { return squared_norm(x[i * stride_x]); }, + [](const norm_type &x, const norm_type &y) { return x + y; }); +} + + +template +__global__ __launch_bounds__(block_size) void finalize_norm2_computation( + size_type size, const ValueType *work, ValueType *result) +{ + finalize_reduce_computation( + size, work, result, + [](const ValueType &x, const ValueType &y) { return x + y; }, + [](const ValueType &x) { return sqrt(x); }); +} + + +template +__global__ __launch_bounds__(default_block_size) void fill_in_coo( + size_type num_rows, size_type num_cols, size_type stride, + const size_type *__restrict__ row_ptrs, + const ValueType *__restrict__ source, IndexType *__restrict__ row_idxs, + IndexType *__restrict__ col_idxs, ValueType *__restrict__ values) +{ + const auto tidx = thread::get_thread_id_flat(); + if (tidx < num_rows) { + size_type write_to = row_ptrs[tidx]; + + for (size_type i = 0; i < num_cols; i++) { + if (source[stride * tidx + i] != zero()) { + values[write_to] = source[stride * tidx + i]; + col_idxs[write_to] = i; + row_idxs[write_to] = tidx; + write_to++; + } + } + } +} + + +template +__global__ __launch_bounds__(default_block_size) void count_nnz_per_row( + size_type num_rows, size_type num_cols, size_type stride, + const ValueType *__restrict__ work, IndexType *__restrict__ result) +{ + constexpr auto warp_size = config::warp_size; + const auto row_idx = thread::get_subwarp_id_flat(); + auto warp_tile = + 
group::tiled_partition(group::this_thread_block()); + + if (row_idx < num_rows) { + IndexType part_result{}; + for (auto i = warp_tile.thread_rank(); i < num_cols; i += warp_size) { + if (work[stride * row_idx + i] != zero()) { + part_result += 1; + } + } + result[row_idx] = reduce( + warp_tile, part_result, + [](const size_type &a, const size_type &b) { return a + b; }); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void fill_in_csr( + size_type num_rows, size_type num_cols, size_type stride, + const ValueType *__restrict__ source, IndexType *__restrict__ row_ptrs, + IndexType *__restrict__ col_idxs, ValueType *__restrict__ values) +{ + const auto tidx = thread::get_thread_id_flat(); + + if (tidx < num_rows) { + auto write_to = row_ptrs[tidx]; + for (auto i = 0; i < num_cols; i++) { + if (source[stride * tidx + i] != zero()) { + values[write_to] = source[stride * tidx + i]; + col_idxs[write_to] = i; + write_to++; + } + } + } +} + + +template +__global__ __launch_bounds__(default_block_size) void fill_in_ell( + size_type num_rows, size_type num_cols, size_type source_stride, + const ValueType *__restrict__ source, size_type max_nnz_per_row, + size_type result_stride, IndexType *__restrict__ col_ptrs, + ValueType *__restrict__ values) +{ + const auto tidx = thread::get_thread_id_flat(); + if (tidx < num_rows) { + IndexType col_idx = 0; + for (size_type col = 0; col < num_cols; col++) { + if (source[tidx * source_stride + col] != zero()) { + col_ptrs[col_idx * result_stride + tidx] = col; + values[col_idx * result_stride + tidx] = + source[tidx * source_stride + col]; + col_idx++; + } + } + for (size_type j = col_idx; j < max_nnz_per_row; j++) { + col_ptrs[j * result_stride + tidx] = 0; + values[j * result_stride + tidx] = zero(); + } + } else if (tidx < result_stride) { + for (size_type j = 0; j < max_nnz_per_row; j++) { + col_ptrs[j * result_stride + tidx] = 0; + values[j * result_stride + tidx] = zero(); + } + } +} + + +__global__ 
__launch_bounds__(config::warp_size) void calculate_slice_lengths( + size_type num_rows, size_type slice_size, int slice_num, + size_type stride_factor, const size_type *__restrict__ nnz_per_row, + size_type *__restrict__ slice_lengths, size_type *__restrict__ slice_sets) +{ + constexpr auto warp_size = config::warp_size; + const auto sliceid = blockIdx.x; + const auto tid_in_warp = threadIdx.x; + + if (sliceid * slice_size + tid_in_warp < num_rows) { + size_type thread_result = 0; + for (size_type i = tid_in_warp; i < slice_size; i += warp_size) { + thread_result = + (i + slice_size * sliceid < num_rows) + ? max(thread_result, nnz_per_row[sliceid * slice_size + i]) + : thread_result; + } + + auto warp_tile = + group::tiled_partition(group::this_thread_block()); + auto warp_result = reduce( + warp_tile, thread_result, + [](const size_type &a, const size_type &b) { return max(a, b); }); + + if (tid_in_warp == 0) { + auto slice_length = + ceildiv(warp_result, stride_factor) * stride_factor; + slice_lengths[sliceid] = slice_length; + slice_sets[sliceid] = slice_length; + } + } +} + + +template +__global__ __launch_bounds__(default_block_size) void fill_in_sellp( + size_type num_rows, size_type num_cols, size_type slice_size, + size_type stride, const ValueType *__restrict__ source, + size_type *__restrict__ slice_lengths, size_type *__restrict__ slice_sets, + IndexType *__restrict__ col_idxs, ValueType *__restrict__ vals) +{ + const auto global_row = thread::get_thread_id_flat(); + const auto row = global_row % slice_size; + const auto sliceid = global_row / slice_size; + + if (global_row < num_rows) { + size_type sellp_ind = slice_sets[sliceid] * slice_size + row; + + for (size_type col = 0; col < num_cols; col++) { + auto val = source[global_row * stride + col]; + if (val != zero()) { + col_idxs[sellp_ind] = col; + vals[sellp_ind] = val; + sellp_ind += slice_size; + } + } + for (size_type i = sellp_ind; + i < + (slice_sets[sliceid] + slice_lengths[sliceid]) * 
slice_size + row; + i += slice_size) { + col_idxs[i] = 0; + vals[i] = zero(); + } + } +} + + +__global__ __launch_bounds__(default_block_size) void reduce_max_nnz( + size_type size, const size_type *__restrict__ nnz_per_row, + size_type *__restrict__ result) +{ + extern __shared__ size_type block_max[]; + + reduce_array( + size, nnz_per_row, block_max, + [](const size_type &x, const size_type &y) { return max(x, y); }); + + if (threadIdx.x == 0) { + result[blockIdx.x] = block_max[0]; + } +} + + +__global__ __launch_bounds__(default_block_size) void reduce_max_nnz_per_slice( + size_type num_rows, size_type slice_size, size_type stride_factor, + const size_type *__restrict__ nnz_per_row, size_type *__restrict__ result) +{ + constexpr auto warp_size = config::warp_size; + auto warp_tile = + group::tiled_partition(group::this_thread_block()); + const auto warpid = thread::get_subwarp_id_flat(); + const auto tid_in_warp = warp_tile.thread_rank(); + const auto slice_num = ceildiv(num_rows, slice_size); + + size_type thread_result = 0; + for (size_type i = tid_in_warp; i < slice_size; i += warp_size) { + if (warpid * slice_size + i < num_rows) { + thread_result = + max(thread_result, nnz_per_row[warpid * slice_size + i]); + } + } + + auto warp_result = reduce( + warp_tile, thread_result, + [](const size_type &a, const size_type &b) { return max(a, b); }); + + if (tid_in_warp == 0 && warpid < slice_num) { + result[warpid] = ceildiv(warp_result, stride_factor) * stride_factor; + } +} + + +__global__ __launch_bounds__(default_block_size) void reduce_total_cols( + size_type num_slices, const size_type *__restrict__ max_nnz_per_slice, + size_type *__restrict__ result) +{ + extern __shared__ size_type block_result[]; + + reduce_array(num_slices, max_nnz_per_slice, block_result, + [](const size_type &x, const size_type &y) { return x + y; }); + + if (threadIdx.x == 0) { + result[blockIdx.x] = block_result[0]; + } +} + + +template +__global__ __launch_bounds__(block_size) void 
row_permute( + size_type num_rows, size_type num_cols, + const IndexType *__restrict__ perm_idxs, const ValueType *__restrict__ orig, + size_type stride_orig, ValueType *__restrict__ result, + size_type stride_result) +{ + constexpr auto warps_per_block = block_size / config::warp_size; + const auto global_id = + thread::get_thread_id(); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + if (row_id < num_rows) { + result[row_id * stride_result + col_id] = + orig[perm_idxs[row_id] * stride_orig + col_id]; + } +} + + +template +__global__ __launch_bounds__(block_size) void column_permute( + size_type num_rows, size_type num_cols, + const IndexType *__restrict__ perm_idxs, const ValueType *__restrict__ orig, + size_type stride_orig, ValueType *__restrict__ result, + size_type stride_result) +{ + constexpr auto warps_per_block = block_size / config::warp_size; + const auto global_id = + thread::get_thread_id(); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + if (row_id < num_rows) { + result[row_id * stride_result + col_id] = + orig[row_id * stride_orig + perm_idxs[col_id]]; + } +} + + +template +__global__ __launch_bounds__(block_size) void inverse_row_permute( + size_type num_rows, size_type num_cols, + const IndexType *__restrict__ perm_idxs, const ValueType *__restrict__ orig, + size_type stride_orig, ValueType *__restrict__ result, + size_type stride_result) +{ + constexpr auto warps_per_block = block_size / config::warp_size; + const auto global_id = + thread::get_thread_id(); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + if (row_id < num_rows) { + result[perm_idxs[row_id] * stride_result + col_id] = + orig[row_id * stride_orig + col_id]; + } +} + + +template +__global__ __launch_bounds__(block_size) void inverse_column_permute( + size_type num_rows, size_type num_cols, + const IndexType *__restrict__ perm_idxs, const ValueType 
*__restrict__ orig, + size_type stride_orig, ValueType *__restrict__ result, + size_type stride_result) +{ + constexpr auto warps_per_block = block_size / config::warp_size; + const auto global_id = + thread::get_thread_id(); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + if (row_id < num_rows) { + result[row_id * stride_result + perm_idxs[col_id]] = + orig[row_id * stride_orig + col_id]; + } +} + + +} // namespace kernel diff --git a/common/matrix/ell_kernels.hpp.inc b/common/matrix/ell_kernels.hpp.inc new file mode 100644 index 00000000000..8b569b650c9 --- /dev/null +++ b/common/matrix/ell_kernels.hpp.inc @@ -0,0 +1,240 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +namespace kernel { +namespace { + + +template +__device__ void spmv_kernel( + const size_type num_rows, const int num_worker_per_row, + const ValueType *__restrict__ val, const IndexType *__restrict__ col, + const size_type stride, const size_type num_stored_elements_per_row, + const ValueType *__restrict__ b, const size_type b_stride, + ValueType *__restrict__ c, const size_type c_stride, Closure op) +{ + const auto tidx = thread::get_thread_id_flat(); + const auto column_id = blockIdx.y; + if (num_thread_per_worker == 1) { + // Specialize the num_thread_per_worker = 1. 
It doesn't need the shared + // memory, __syncthreads, and atomic_add + if (tidx < num_rows) { + ValueType temp = zero(); + for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { + const auto ind = tidx + idx * stride; + const auto col_idx = col[ind]; + if (col_idx < idx) { + break; + } else { + temp += val[ind] * b[col_idx * b_stride + column_id]; + } + } + const auto c_ind = tidx * c_stride + column_id; + c[c_ind] = op(temp, c[c_ind]); + } + } else { + if (tidx < num_worker_per_row * num_rows) { + const auto idx_in_worker = threadIdx.y; + const auto x = tidx % num_rows; + const auto worker_id = tidx / num_rows; + const auto step_size = num_worker_per_row * num_thread_per_worker; + __shared__ UninitializedArray + storage; + if (idx_in_worker == 0) { + storage[threadIdx.x] = 0; + } + __syncthreads(); + ValueType temp = zero(); + for (size_type idx = + worker_id * num_thread_per_worker + idx_in_worker; + idx < num_stored_elements_per_row; idx += step_size) { + const auto ind = x + idx * stride; + const auto col_idx = col[ind]; + if (col_idx < idx) { + break; + } else { + temp += val[ind] * b[col_idx * b_stride + column_id]; + } + } + atomic_add(&storage[threadIdx.x], temp); + __syncthreads(); + if (idx_in_worker == 0) { + const auto c_ind = x * c_stride + column_id; + if (atomic) { + atomic_add(&(c[c_ind]), op(storage[threadIdx.x], c[c_ind])); + } else { + c[c_ind] = op(storage[threadIdx.x], c[c_ind]); + } + } + } + } +} + + +template +__global__ __launch_bounds__(default_block_size) void spmv( + const size_type num_rows, const int num_worker_per_row, + const ValueType *__restrict__ val, const IndexType *__restrict__ col, + const size_type stride, const size_type num_stored_elements_per_row, + const ValueType *__restrict__ b, const size_type b_stride, + ValueType *__restrict__ c, const size_type c_stride) +{ + spmv_kernel( + num_rows, num_worker_per_row, val, col, stride, + num_stored_elements_per_row, b, b_stride, c, c_stride, + [](const ValueType &x, 
const ValueType &y) { return x; }); +} + + +template +__global__ __launch_bounds__(default_block_size) void spmv( + const size_type num_rows, const int num_worker_per_row, + const ValueType *__restrict__ alpha, const ValueType *__restrict__ val, + const IndexType *__restrict__ col, const size_type stride, + const size_type num_stored_elements_per_row, + const ValueType *__restrict__ b, const size_type b_stride, + const ValueType *__restrict__ beta, ValueType *__restrict__ c, + const size_type c_stride) +{ + const ValueType alpha_val = alpha[0]; + const ValueType beta_val = beta[0]; + // Because the atomic operation changes the values of c during computation, + // it can not do the right alpha * a * b + beta * c operation. + // Thus, the cuda kernel only computes alpha * a * b when it uses atomic + // operation. + if (atomic) { + spmv_kernel( + num_rows, num_worker_per_row, val, col, stride, + num_stored_elements_per_row, b, b_stride, c, c_stride, + [&alpha_val](const ValueType &x, const ValueType &y) { + return alpha_val * x; + }); + } else { + spmv_kernel( + num_rows, num_worker_per_row, val, col, stride, + num_stored_elements_per_row, b, b_stride, c, c_stride, + [&alpha_val, &beta_val](const ValueType &x, const ValueType &y) { + return alpha_val * x + beta_val * y; + }); + } +} + + +} // namespace + + +template +__global__ __launch_bounds__(config::max_block_size) void initialize_zero_dense( + size_type num_rows, size_type num_cols, size_type stride, + ValueType *__restrict__ result) +{ + const auto tidx_x = threadIdx.x + blockDim.x * blockIdx.x; + const auto tidx_y = threadIdx.y + blockDim.y * blockIdx.y; + if (tidx_x < num_cols && tidx_y < num_rows) { + result[tidx_y * stride + tidx_x] = zero(); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void fill_in_dense( + size_type num_rows, size_type nnz, size_type source_stride, + const IndexType *__restrict__ col_idxs, + const ValueType *__restrict__ values, size_type result_stride, + 
ValueType *__restrict__ result) +{ + const auto tidx = thread::get_thread_id_flat(); + if (tidx < num_rows) { + for (auto col = 0; col < nnz; col++) { + result[tidx * result_stride + + col_idxs[tidx + col * source_stride]] += + values[tidx + col * source_stride]; + } + } +} + + +template +__global__ __launch_bounds__(default_block_size) void count_nnz_per_row( + size_type num_rows, size_type max_nnz_per_row, size_type stride, + const ValueType *__restrict__ values, IndexType *__restrict__ result) +{ + constexpr auto warp_size = config::warp_size; + const auto row_idx = thread::get_subwarp_id_flat(); + auto warp_tile = + group::tiled_partition(group::this_thread_block()); + + if (row_idx < num_rows) { + IndexType part_result{}; + for (auto i = warp_tile.thread_rank(); i < max_nnz_per_row; + i += warp_size) { + if (values[stride * i + row_idx] != zero()) { + part_result += 1; + } + } + result[row_idx] = reduce( + warp_tile, part_result, + [](const size_type &a, const size_type &b) { return a + b; }); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void fill_in_csr( + size_type num_rows, size_type max_nnz_per_row, size_type stride, + const ValueType *__restrict__ source_values, + const IndexType *__restrict__ source_col_idxs, + IndexType *__restrict__ result_row_ptrs, + IndexType *__restrict__ result_col_idxs, + ValueType *__restrict__ result_values) +{ + const auto tidx = thread::get_thread_id_flat(); + + if (tidx < num_rows) { + auto write_to = result_row_ptrs[tidx]; + for (auto i = 0; i < max_nnz_per_row; i++) { + const auto source_idx = tidx + stride * i; + if (source_values[source_idx] != zero()) { + result_values[write_to] = source_values[source_idx]; + result_col_idxs[write_to] = source_col_idxs[source_idx]; + write_to++; + } + } + } +} + + +} // namespace kernel \ No newline at end of file diff --git a/common/matrix/hybrid_kernels.hpp.inc b/common/matrix/hybrid_kernels.hpp.inc new file mode 100644 index 00000000000..a2c9d2c7ae4 --- 
/dev/null
+++ b/common/matrix/hybrid_kernels.hpp.inc
@@ -0,0 +1,142 @@
+/*************************************************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+namespace kernel {
+
+
+/**
+ * The global function for counting the number of nonzeros per row of COO.
+ * It is structured almost like the COO spmv routine: conceptually it
+ * multiplies is_nonzero(Coo) with a vector whose values are all one.
+ *
+ * Every thread adds one to a running count for each in-range entry whose
+ * value differs from zero. The counts are combined with segment_scan and
+ * added to nnz_per_row with atomic_add.
+ * NOTE(review): results are only ever added to nnz_per_row, so the array
+ * presumably has to be zeroed by the caller - confirm.
+ *
+ * @param nnz  the number of nonzeros in the matrix
+ * @param num_lines  the maximum round of each warp
+ * @param val  the value array of the matrix
+ * @param row  the row index array of the matrix
+ * @param nnz_per_row  the output nonzeros per row
+ */
+template
+__global__ __launch_bounds__(default_block_size) void count_coo_row_nnz(
+    const size_type nnz, const size_type num_lines,
+    const ValueType *__restrict__ val, const IndexType *__restrict__ row,
+    IndexType *__restrict__ nnz_per_row)
+{
+    IndexType temp_val = 0;
+    // first entry handled by this threadIdx.y group of this block
+    const auto start = static_cast(blockDim.x) * blockIdx.x *
+                           blockDim.y * num_lines +
+                       threadIdx.y * blockDim.x * num_lines;
+    // number of rounds for this group: 0 if start is past the end,
+    // otherwise capped at num_lines
+    size_type num = (nnz > start) * ceildiv(nnz - start, subwarp_size);
+    num = min(num, num_lines);
+    const IndexType ind_start = start + threadIdx.x;
+    const IndexType ind_end = ind_start + (num - 1) * subwarp_size;
+    IndexType ind = ind_start;
+    IndexType curr_row = (ind < nnz) ? row[ind] : 0;
+    const auto tile_block =
+        group::tiled_partition(group::this_thread_block());
+    // all rounds except the last one; the last round is handled below
+    // because it needs no look-ahead to the next row index
+    for (; ind < ind_end; ind += subwarp_size) {
+        temp_val += ind < nnz && val[ind] != zero();
+        // out-of-range look-ahead falls back to the last stored row index
+        auto next_row =
+            (ind + subwarp_size < nnz) ? row[ind + subwarp_size] : row[nnz - 1];
+        // segmented scan
+        // only flush when some thread of the tile changes row next round
+        if (tile_block.any(curr_row != next_row)) {
+            bool is_first_in_segment =
+                segment_scan(tile_block, curr_row, &temp_val);
+            if (is_first_in_segment) {
+                atomic_add(&(nnz_per_row[curr_row]), temp_val);
+            }
+            temp_val = 0;
+        }
+        curr_row = next_row;
+    }
+    if (num > 0) {
+        // last round: always flush whatever has been accumulated
+        ind = ind_end;
+        temp_val += ind < nnz && val[ind] != zero();
+        // segmented scan
+
+        bool is_first_in_segment =
+            segment_scan(tile_block, curr_row, &temp_val);
+        if (is_first_in_segment) {
+            atomic_add(&(nnz_per_row[curr_row]), temp_val);
+        }
+    }
+}
+
+
+/**
+ * Copies the nonzero entries of one row into CSR value/column arrays.
+ *
+ * Thread tidx handles row tidx (threads with tidx >= num_rows do nothing).
+ * Writing starts at result_row_ptrs[tidx]. First the ELL entries at
+ * tidx + stride * i for i in [0, max_nnz_per_row) are copied, then the COO
+ * entries in [coo_offset[tidx], coo_offset[tidx + 1]). Entries equal to
+ * zero are skipped in both parts.
+ * NOTE(review): result_row_ptrs is only read here, so it presumably has
+ * to be filled beforehand - confirm against the caller.
+ */
+template
+__global__ __launch_bounds__(default_block_size) void fill_in_csr(
+    size_type num_rows, size_type max_nnz_per_row, size_type stride,
+    const ValueType *__restrict__ ell_val,
+    const IndexType *__restrict__ ell_col,
+    const ValueType *__restrict__ coo_val,
+    const IndexType *__restrict__ coo_col,
+    const IndexType *__restrict__ coo_offset,
+    IndexType *__restrict__ result_row_ptrs,
+    IndexType *__restrict__ result_col_idxs,
+    ValueType *__restrict__ result_values)
+{
+    const auto tidx = thread::get_thread_id_flat();
+
+    if (tidx < num_rows) {
+        auto write_to = result_row_ptrs[tidx];
+        // ELL part of the row
+        for (auto i = 0; i < max_nnz_per_row; i++) {
+            const auto source_idx = tidx + stride * i;
+            if (ell_val[source_idx] != zero()) {
+                result_values[write_to] = ell_val[source_idx];
+                result_col_idxs[write_to] = ell_col[source_idx];
+                write_to++;
+            }
+        }
+        // COO part of the row, appended after the ELL entries
+        for (auto i = coo_offset[tidx]; i < coo_offset[tidx + 1]; i++) {
+            if (coo_val[i] != zero()) {
+                result_values[write_to] = coo_val[i];
+                result_col_idxs[write_to] = coo_col[i];
+                write_to++;
+            }
+        }
+    }
+}
+
+
+/**
+ * Element-wise in-place addition: val1[i] += val2[i] for all i < num,
+ * one element per thread.
+ */
+template
+__global__ __launch_bounds__(default_block_size) void add(
+    size_type num, ValueType1 *__restrict__ val1,
+    const ValueType2 *__restrict__ val2)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    if (tidx < num) {
+        val1[tidx] += val2[tidx];
+    }
+}
+
+
+}  // namespace kernel
\ No newline at end of file
diff --git 
a/common/matrix/sellp_kernels.hpp.inc b/common/matrix/sellp_kernels.hpp.inc new file mode 100644 index 00000000000..d1a0bee9d12 --- /dev/null +++ b/common/matrix/sellp_kernels.hpp.inc @@ -0,0 +1,199 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/
+
+namespace {
+
+
+/**
+ * SELL-P sparse matrix times dense matrix: c = A * b.
+ *
+ * One thread block handles one slice (blockIdx.x), one thread one row of
+ * that slice (threadIdx.x), and blockIdx.y selects the column of b and c.
+ * Entry i of a row lives at
+ * row_in_slice + (slice_sets[slice_id] + i) * slice_size, where slice_size
+ * is taken from blockDim.x.
+ */
+template
+__global__ __launch_bounds__(matrix::default_slice_size) void spmv_kernel(
+    size_type num_rows, size_type num_right_hand_sides, size_type b_stride,
+    size_type c_stride, const size_type *__restrict__ slice_lengths,
+    const size_type *__restrict__ slice_sets, const ValueType *__restrict__ a,
+    const IndexType *__restrict__ col, const ValueType *__restrict__ b,
+    ValueType *__restrict__ c)
+{
+    const auto slice_id = blockIdx.x;
+    const auto slice_size = blockDim.x;
+    const auto row_in_slice = threadIdx.x;
+    const auto global_row =
+        static_cast(slice_size) * slice_id + row_in_slice;
+    const auto column_id = blockIdx.y;
+    ValueType val = 0;
+    IndexType ind = 0;
+    if (global_row < num_rows && column_id < num_right_hand_sides) {
+        for (size_type i = 0; i < slice_lengths[slice_id]; i++) {
+            ind = row_in_slice + (slice_sets[slice_id] + i) * slice_size;
+            val += a[ind] * b[col[ind] * b_stride + column_id];
+        }
+        c[global_row * c_stride + column_id] = val;
+    }
+}
+
+
+/**
+ * Scaled SELL-P product: c = alpha[0] * A * b + beta[0] * c.
+ *
+ * Uses the same thread mapping and storage indexing as spmv_kernel. alpha
+ * and beta are read through pointers, element 0 only.
+ */
+template
+__global__
+    __launch_bounds__(matrix::default_slice_size) void advanced_spmv_kernel(
+        size_type num_rows, size_type num_right_hand_sides, size_type b_stride,
+        size_type c_stride, const size_type *__restrict__ slice_lengths,
+        const size_type *__restrict__ slice_sets,
+        const ValueType *__restrict__ alpha, const ValueType *__restrict__ a,
+        const IndexType *__restrict__ col, const ValueType *__restrict__ b,
+        const ValueType *__restrict__ beta, ValueType *__restrict__ c)
+{
+    const auto slice_id = blockIdx.x;
+    const auto slice_size = blockDim.x;
+    const auto row_in_slice = threadIdx.x;
+    const auto global_row =
+        static_cast(slice_size) * slice_id + row_in_slice;
+    const auto column_id = blockIdx.y;
+    ValueType val = 0;
+    IndexType ind = 0;
+    if (global_row < num_rows && column_id < num_right_hand_sides) {
+        for (size_type i = 0; i < slice_lengths[slice_id]; i++) {
+            ind = row_in_slice + (slice_sets[slice_id] + i) * slice_size;
+            val += alpha[0] * a[ind] * b[col[ind] * b_stride + column_id];
+        }
+        c[global_row * c_stride + column_id] =
+            beta[0] * c[global_row * c_stride + column_id] + val;
+    }
+}
+
+
+}  // namespace
+
+
+namespace kernel {
+
+
+/**
+ * Sets every entry (row < num_rows, column < num_cols) of a row-major dense
+ * array with the given stride to zero. The x thread index selects the
+ * column, the y thread index the row.
+ */
+template
+__global__ __launch_bounds__(default_block_size) void initialize_zero_dense(
+    size_type num_rows, size_type num_cols, size_type stride,
+    ValueType *__restrict__ result)
+{
+    const auto tidx_x = threadIdx.x + blockDim.x * blockIdx.x;
+    const auto tidx_y = threadIdx.y + blockDim.y * blockIdx.y;
+    if (tidx_x < num_cols && tidx_y < num_rows) {
+        result[tidx_y * stride + tidx_x] = zero();
+    }
+}
+
+
+/**
+ * Scatters the nonzero entries of a SELL-P matrix into a dense array.
+ *
+ * The row is chosen by thread::get_subwarp_id_flat(); the threads sharing
+ * a row start at threadIdx.x % threads_per_row and step through the slice
+ * length by threads_per_row. Entries equal to zero are not written.
+ * NOTE(review): only nonzero positions are assigned, so result presumably
+ * has to be zeroed first (e.g. with initialize_zero_dense) - confirm.
+ */
+template
+__global__ __launch_bounds__(default_block_size) void fill_in_dense(
+    size_type num_rows, size_type num_cols, size_type stride,
+    size_type slice_size, const size_type *__restrict__ slice_lengths,
+    const size_type *__restrict__ slice_sets,
+    const IndexType *__restrict__ col_idxs,
+    const ValueType *__restrict__ values, ValueType *__restrict__ result)
+{
+    const auto global_row = thread::get_subwarp_id_flat();
+    const auto row = global_row % slice_size;
+    const auto slice = global_row / slice_size;
+    const auto start_index = threadIdx.x % threads_per_row;
+
+    if (global_row < num_rows) {
+        for (auto i = start_index; i < slice_lengths[slice];
+             i += threads_per_row) {
+            if (values[(slice_sets[slice] + i) * slice_size + row] !=
+                zero()) {
+                result[global_row * stride +
+                       col_idxs[(slice_sets[slice] + i) * slice_size + row]] =
+                    values[(slice_sets[slice] + i) * slice_size + row];
+            }
+        }
+    }
+}
+
+
+/**
+ * Counts the stored values different from zero in each SELL-P row.
+ *
+ * The row is chosen by thread::get_subwarp_id_flat(). Each lane of the
+ * tile counts a strided part of the row (step warp_size * slice_size), and
+ * the partial counts are summed with reduce() into result[row_idx].
+ */
+template
+__global__ __launch_bounds__(default_block_size) void count_nnz_per_row(
+    size_type num_rows, size_type slice_size,
+    const size_type *__restrict__ slice_sets,
+    const ValueType *__restrict__ values, IndexType *__restrict__ result)
+{
+    constexpr auto warp_size = config::warp_size;
+    auto warp_tile =
+        group::tiled_partition(group::this_thread_block());
+    const auto row_idx = thread::get_subwarp_id_flat();
+    const auto slice_id = row_idx / slice_size;
+    const auto tid_in_warp = warp_tile.thread_rank();
+    const auto row_in_slice = row_idx % slice_size;
+
+    if (row_idx < num_rows) {
+        IndexType part_result{};
+        // the slice ends where the next slice begins
+        for (size_type sellp_ind =
+                 (slice_sets[slice_id] + tid_in_warp) * slice_size +
+                 row_in_slice;
+             sellp_ind < slice_sets[slice_id + 1] * slice_size;
+             sellp_ind += warp_size * slice_size) {
+            if (values[sellp_ind] != zero()) {
+                part_result += 1;
+            }
+        }
+        result[row_idx] = reduce(
+            warp_tile, part_result,
+            [](const size_type &a, const size_type &b) { return a + b; });
+    }
+}
+
+
+/**
+ * Copies the nonzero entries of one SELL-P row into CSR value/column
+ * arrays, one thread per row.
+ *
+ * Writing starts at result_row_ptrs[row]; entries equal to zero are
+ * skipped.
+ * NOTE(review): result_row_ptrs is only read here, so it presumably has
+ * to be filled beforehand - confirm against the caller.
+ */
+template
+__global__ __launch_bounds__(default_block_size) void fill_in_csr(
+    size_type num_rows, size_type slice_size,
+    const size_type *__restrict__ source_slice_sets,
+    const IndexType *__restrict__ source_col_idxs,
+    const ValueType *__restrict__ source_values,
+    IndexType *__restrict__ result_row_ptrs,
+    IndexType *__restrict__ result_col_idxs,
+    ValueType *__restrict__ result_values)
+{
+    const auto row = thread::get_thread_id_flat();
+    const auto slice_id = row / slice_size;
+    const auto row_in_slice = row % slice_size;
+
+    if (row < num_rows) {
+        size_type csr_ind = result_row_ptrs[row];
+        for (size_type sellp_ind =
+                 source_slice_sets[slice_id] * slice_size + row_in_slice;
+             sellp_ind < source_slice_sets[slice_id + 1] * slice_size;
+             sellp_ind += slice_size) {
+            if (source_values[sellp_ind] != zero()) {
+                result_values[csr_ind] = source_values[sellp_ind];
+                result_col_idxs[csr_ind] = source_col_idxs[sellp_ind];
+                csr_ind++;
+            }
+        }
+    }
+}
+
+
+}  // namespace kernel
\ No newline at end of file
diff --git a/common/preconditioner/isai_kernels.hpp.inc b/common/preconditioner/isai_kernels.hpp.inc
new file mode 100644
index 00000000000..9eec6afaa04
--- /dev/null
+++ b/common/preconditioner/isai_kernels.hpp.inc
@@ -0,0 +1,336 @@
+/*************************************************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights 
reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+namespace kernel {
+
+
+/**
+ * @internal
+ *
+ * Shared driver for the generate kernels below; one subwarp handles one
+ * row of the inverse pattern (i_row_ptrs / i_col_idxs).
+ *
+ * Rows with at most `subwarp_size` (< `warp_size`) elements are solved
+ * directly: the matching entries of the system matrix are gathered into a
+ * dense subwarp_size x subwarp_size block in shared memory, `trs_solve` is
+ * called on it, and the result is written to i_values.
+ *
+ * Rows with more elements are not solved here. Only their size and the
+ * number of matching entries are stored in excess_rhs_sizes and excess_nnz;
+ * for short rows both are set to 0.
+ */
+template
+__forceinline__ __device__ void generic_generate(
+    IndexType num_rows, const IndexType *__restrict__ m_row_ptrs,
+    const IndexType *__restrict__ m_col_idxs,
+    const ValueType *__restrict__ m_values,
+    const IndexType *__restrict__ i_row_ptrs,
+    const IndexType *__restrict__ i_col_idxs, ValueType *__restrict__ i_values,
+    IndexType *__restrict__ excess_rhs_sizes,
+    IndexType *__restrict__ excess_nnz, Callable trs_solve)
+{
+    static_assert(subwarp_size >= row_size_limit, "incompatible subwarp_size");
+    const auto row = thread::get_subwarp_id_flat();
+
+    if (row >= num_rows) {
+        return;
+    }
+
+    const auto i_row_begin = i_row_ptrs[row];
+    const auto i_row_size = i_row_ptrs[row + 1] - i_row_begin;
+
+    auto subwarp =
+        group::tiled_partition(group::this_thread_block());
+    const int local_id = subwarp.thread_rank();
+
+    if (i_row_size > subwarp_size) {
+        // defer long rows: store their nnz and number of matches
+        IndexType count{};
+        for (IndexType nz = 0; nz < i_row_size; ++nz) {
+            auto col = i_col_idxs[i_row_begin + nz];
+            auto m_row_begin = m_row_ptrs[col];
+            auto m_row_size = m_row_ptrs[col + 1] - m_row_begin;
+            // extract the sparse submatrix consisting of the entries whose
+            // columns/rows match column indices from this row
+            // (here only the matches are counted, nothing is stored)
+            group_match(
+                m_col_idxs + m_row_begin, m_row_size, i_col_idxs + i_row_begin,
+                i_row_size, subwarp,
+                [&](IndexType, IndexType, IndexType,
+                    config::lane_mask_type matchmask,
+                    bool) { count += popcnt(matchmask); });
+        }
+        // store the dim and nnz of this sparse block
+        if (local_id == 0) {
+            excess_rhs_sizes[row] = i_row_size;
+            excess_nnz[row] = count;
+        }
+    } else {
+        // handle short rows directly: no excess
+        if (local_id == 0) {
+            excess_rhs_sizes[row] = 0;
+            excess_nnz[row] = 0;
+        }
+
+        // subwarp_size^2 storage per subwarp
+        __shared__ UninitializedArray
+            storage;
+
+        // this subwarp's private part of the shared storage
+        auto trisystem_ptr = storage + (threadIdx.x / subwarp_size) *
+                                           subwarp_size * subwarp_size;
+        // row-major accessor
+        auto trisystem = [&](IndexType row, IndexType col) -> ValueType & {
+            return trisystem_ptr[row * subwarp_size + col];
+        };
+
+        // each lane zeroes its own column of the block
+#pragma unroll
+        for (int i = 0; i < subwarp_size; ++i) {
+            trisystem(i, local_id) = zero();
+        }
+
+        subwarp.sync();
+
+        for (IndexType nz = 0; nz < i_row_size; ++nz) {
+            auto col = i_col_idxs[i_row_begin + nz];
+            auto m_row_begin = m_row_ptrs[col];
+            auto m_row_size = m_row_ptrs[col + 1] - m_row_begin;
+            // extract the dense submatrix consisting of the entries whose
+            // columns/rows match column indices from this row
+            group_match(
+                m_col_idxs + m_row_begin, m_row_size, i_col_idxs + i_row_begin,
+                i_row_size, subwarp,
+                [&](IndexType, IndexType m_idx, IndexType i_idx,
+                    config::lane_mask_type, bool valid) {
+                    if (valid) {
+                        trisystem(nz, i_idx) = m_values[m_row_begin + m_idx];
+                    }
+                });
+        }
+
+        subwarp.sync();
+
+        // Now, read a full col of `trisystem` into local registers, which will
+        // be row elements after this (implicit) transpose
+        ValueType local_row[subwarp_size];
+#pragma unroll
+        for (int i = 0; i < subwarp_size; ++i) {
+            local_row[i] = trisystem(i, local_id);
+        }
+
+        const auto rhs = trs_solve(i_row_size, local_row, subwarp);
+
+        // Write back:
+        // a non-finite result is replaced by the matching identity entry
+        // (one on the diagonal, zero elsewhere)
+        if (local_id < i_row_size) {
+            const auto idx = i_row_begin + local_id;
+            if (is_finite(rhs)) {
+                i_values[idx] = rhs;
+            } else {
+                i_values[idx] = i_col_idxs[idx] == row ? one()
+                                                       : zero();
+            }
+        }
+    }
+}
+
+
+/**
+ * Runs generic_generate with a solver whose right-hand side is one at the
+ * last position (num_elems - 1) and zero elsewhere. The solver walks the
+ * columns from last to first: the lane owning column d_col divides by its
+ * own entry, the result is broadcast with shfl, and all lanes with a
+ * smaller rank eliminate it from their value.
+ */
+template
+__global__ __launch_bounds__(default_block_size) void generate_l_inverse(
+    IndexType num_rows, const IndexType *__restrict__ m_row_ptrs,
+    const IndexType *__restrict__ m_col_idxs,
+    const ValueType *__restrict__ m_values,
+    const IndexType *__restrict__ i_row_ptrs,
+    const IndexType *__restrict__ i_col_idxs, ValueType *__restrict__ i_values,
+    IndexType *__restrict__ excess_rhs_sizes,
+    IndexType *__restrict__ excess_nnz)
+{
+    auto trs_solve = [](IndexType num_elems,
+                        const ValueType *__restrict__ local_row,
+                        group::thread_block_tile &subwarp) {
+        const int local_id = subwarp.thread_rank();
+        ValueType rhs =
+            local_id == num_elems - 1 ? one() : zero();
+        // Solve Triangular system
+        for (int d_col = num_elems - 1; d_col >= 0; --d_col) {
+            const auto elem = local_row[d_col];
+            if (d_col == local_id) {
+                rhs /= elem;
+            }
+
+            const ValueType bot = subwarp.shfl(rhs, d_col);
+            if (local_id < d_col) {
+                rhs -= bot * elem;
+            }
+        }
+
+        return rhs;
+    };
+
+    generic_generate(
+        num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, i_col_idxs,
+        i_values, excess_rhs_sizes, excess_nnz, trs_solve);
+}
+
+
+/**
+ * Runs generic_generate with a solver whose right-hand side is one at
+ * position 0 and zero elsewhere. The solver walks the columns from first
+ * to last: the lane owning column d_col divides by its own entry, the
+ * result is broadcast with shfl, and all lanes with a larger rank
+ * eliminate it from their value.
+ */
+template
+__global__ __launch_bounds__(default_block_size) void generate_u_inverse(
+    IndexType num_rows, const IndexType *__restrict__ m_row_ptrs,
+    const IndexType *__restrict__ m_col_idxs,
+    const ValueType *__restrict__ m_values,
+    const IndexType *__restrict__ i_row_ptrs,
+    const IndexType *__restrict__ i_col_idxs, ValueType *__restrict__ i_values,
+    IndexType *__restrict__ excess_rhs_sizes,
+    IndexType *__restrict__ excess_nnz)
+{
+    auto trs_solve = [](IndexType num_elems,
+                        const ValueType *__restrict__ local_row,
+                        group::thread_block_tile &subwarp) {
+        const int local_id = subwarp.thread_rank();
+        ValueType rhs = local_id == 0 ? one() : zero();
+        // Solve Triangular system
+        for (int d_col = 0; d_col < num_elems; ++d_col) {
+            const auto elem = local_row[d_col];
+            if (d_col == local_id) {
+                rhs /= elem;
+            }
+
+            const ValueType top = subwarp.shfl(rhs, d_col);
+            if (d_col < local_id) {
+                rhs -= top * elem;
+            }
+        }
+
+        return rhs;
+    };
+
+    generic_generate(
+        num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, i_col_idxs,
+        i_values, excess_rhs_sizes, excess_nnz, trs_solve);
+}
+
+
+/**
+ * Writes the sparse system for the rows that are too long for
+ * generic_generate (more than subwarp_size elements); shorter rows return
+ * early.
+ *
+ * For each such row, the matching entries of the system matrix are written
+ * to excess_col_idxs / excess_values starting at excess_nz_ptrs[row], with
+ * column indices offset by excess_rhs_ptrs[row]. Lane 0 additionally fills
+ * excess_rhs (one where the column index equals the row, zero otherwise)
+ * and excess_row_ptrs.
+ */
+template
+__global__ __launch_bounds__(default_block_size) void generate_excess_system(
+    IndexType num_rows, const IndexType *__restrict__ m_row_ptrs,
+    const IndexType *__restrict__ m_col_idxs,
+    const ValueType *__restrict__ m_values,
+    const IndexType *__restrict__ i_row_ptrs,
+    const IndexType *__restrict__ i_col_idxs,
+    const IndexType *__restrict__ excess_rhs_ptrs,
+    const IndexType *__restrict__ excess_nz_ptrs,
+    IndexType *__restrict__ excess_row_ptrs,
+    IndexType *__restrict__ excess_col_idxs,
+    ValueType *__restrict__ excess_values, ValueType *__restrict__ excess_rhs)
+{
+    const auto row = thread::get_subwarp_id_flat();
+
+    if (row >= num_rows) {
+        return;
+    }
+
+    const auto i_row_begin = i_row_ptrs[row];
+    const auto i_row_size = i_row_ptrs[row + 1] - i_row_begin;
+
+    auto subwarp =
+        group::tiled_partition(group::this_thread_block());
+    const int local_id = subwarp.thread_rank();
+    // bit mask selecting all lanes with a smaller rank than this one
+    const auto prefix_mask = (config::lane_mask_type{1} << local_id) - 1;
+
+    if (row == 0 && local_id == 0) {
+        excess_row_ptrs[0] = 0;
+    }
+
+    if (i_row_size <= subwarp_size) {
+        return;
+    }
+
+    auto excess_rhs_begin = excess_rhs_ptrs[row];
+    auto excess_nz_begin = excess_nz_ptrs[row];
+
+    // long rows only: write their matches into the excess system
+    for (IndexType nz = 0; nz < i_row_size; ++nz) {
+        auto col = i_col_idxs[i_row_begin + nz];
+        auto m_row_begin = m_row_ptrs[col];
+        auto m_row_size = m_row_ptrs[col + 1] - m_row_begin;
+        // extract the sparse submatrix consisting of the entries whose
+        // columns/rows match column indices from this row
+        group_match(
+            m_col_idxs + m_row_begin, m_row_size, i_col_idxs + i_row_begin,
+            i_row_size, subwarp,
+            [&](IndexType col, IndexType m_idx, IndexType i_idx,
+                config::lane_mask_type mask, bool valid) {
+                // trisystem(nz, i_idx) = m_values[m_row_begin + m_idx]
+                // only in sparse :)
+                if (valid) {
+                    // output slot = number of matching lanes before this one
+                    auto nz = excess_nz_begin + popcnt(mask & prefix_mask);
+                    excess_col_idxs[nz] = excess_rhs_begin + i_idx;
+                    excess_values[nz] = m_values[m_row_begin + m_idx];
+                }
+                excess_nz_begin += popcnt(mask);
+            });
+        if (local_id == 0) {
+            // build right-hand side: 1 for diagonal entry, 0 else
+            excess_rhs[excess_rhs_begin + nz] =
+                row == col ? one() : zero();
+            // store row pointers
+            excess_row_ptrs[excess_rhs_begin + nz + 1] = excess_nz_begin;
+        }
+    }
+}
+
+
+/**
+ * Copies the entries of excess_solution back into i_values for every row
+ * whose range in excess_rhs_ptrs is non-empty. The lanes of a subwarp
+ * stride over the row's entries.
+ */
+template
+__global__ __launch_bounds__(default_block_size) void copy_excess_solution(
+    IndexType num_rows, const IndexType *__restrict__ i_row_ptrs,
+    const IndexType *__restrict__ excess_rhs_ptrs,
+    const ValueType *__restrict__ excess_solution,
+    ValueType *__restrict__ i_values)
+{
+    const auto row = thread::get_subwarp_id_flat();
+
+    if (row >= num_rows) {
+        return;
+    }
+
+    auto local_id = threadIdx.x % subwarp_size;
+
+    const auto i_row_begin = i_row_ptrs[row];
+
+    const auto excess_begin = excess_rhs_ptrs[row];
+    const auto excess_size = excess_rhs_ptrs[row + 1] - excess_begin;
+
+    // if it was handled separately:
+    if (excess_size > 0) {
+        // copy the values for this row
+        for (IndexType nz = local_id; nz < excess_size; nz += subwarp_size) {
+            i_values[nz + i_row_begin] = excess_solution[nz + excess_begin];
+        }
+    }
+}
+
+
+}  // namespace kernel
diff --git a/common/preconditioner/jacobi_advanced_apply_kernel.hpp.inc b/common/preconditioner/jacobi_advanced_apply_kernel.hpp.inc
new file mode 100644
index 00000000000..2426728d402
--- /dev/null
+++ b/common/preconditioner/jacobi_advanced_apply_kernel.hpp.inc
@@ -0,0 +1,109 @@
+/*************************************************************
+Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +namespace kernel { + + +template +__global__ void __launch_bounds__(warps_per_block *config::warp_size) + advanced_apply(const ValueType *__restrict__ blocks, + preconditioner::block_interleaved_storage_scheme + storage_scheme, + const IndexType *__restrict__ block_ptrs, + size_type num_blocks, const ValueType *__restrict__ alpha, + const ValueType *__restrict__ b, int32 b_stride, + ValueType *__restrict__ x, int32 x_stride) +{ + const auto block_id = + thread::get_subwarp_id(); + const auto subwarp = + group::tiled_partition(group::this_thread_block()); + if (block_id >= num_blocks) { + return; + } + const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; + ValueType v = zero(); + if (subwarp.thread_rank() < block_size) { + v = alpha[0] * + b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride]; + } + multiply_vec( + subwarp, block_size, v, + blocks + storage_scheme.get_global_block_offset(block_id) + + subwarp.thread_rank(), + storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride, + x_stride, + [](ValueType &result, const ValueType &out) { result += out; }); +} + + +template +__global__ void +__launch_bounds__(warps_per_block *config::warp_size) advanced_adaptive_apply( + const ValueType *__restrict__ blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const precision_reduction *__restrict__ block_precisions, + const IndexType *__restrict__ block_ptrs, size_type num_blocks, + const ValueType *__restrict__ alpha, const ValueType *__restrict__ b, + int32 b_stride, ValueType *__restrict__ x, int32 x_stride) +{ + const auto block_id = + thread::get_subwarp_id(); + const auto subwarp = + group::tiled_partition(group::this_thread_block()); + if (block_id >= num_blocks) { + return; + } + const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; + auto alpha_val = alpha == nullptr ? 
one() : alpha[0]; + ValueType v = zero(); + if (subwarp.thread_rank() < block_size) { + v = alpha[0] * + b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride]; + } + GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( + ValueType, block_precisions[block_id], + multiply_vec( + subwarp, block_size, v, + reinterpret_cast( + blocks + storage_scheme.get_group_offset(block_id)) + + storage_scheme.get_block_offset(block_id) + + subwarp.thread_rank(), + storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride, + x_stride, + [](ValueType &result, const ValueType &out) { result += out; })); +} + + +} // namespace kernel diff --git a/common/preconditioner/jacobi_generate_kernel.hpp.inc b/common/preconditioner/jacobi_generate_kernel.hpp.inc new file mode 100644 index 00000000000..da8fe668aa0 --- /dev/null +++ b/common/preconditioner/jacobi_generate_kernel.hpp.inc @@ -0,0 +1,208 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+namespace kernel {
+
+/**
+ * Tests whether the block held in `row` tolerates a precision round-trip.
+ *
+ * As coded: (1) lanes with rank < block_size back up row[] into `work`
+ * (indexed i * stride + rank) and replace each entry by a double cast of
+ * itself; (2) block_cond is the infinity norm before invert_block times
+ * the infinity norm after it; (3) row[] is restored from `work`.
+ * Returns true only if invert_block reported success, block_cond >= 1.0
+ * and block_cond * eps < 1e-3.
+ *
+ * NOTE(review): the cast target types and the float_traits argument are
+ * missing from this extract - presumably the reduced-precision type;
+ * confirm against the real file.
+ */
+template
+__device__ __forceinline__ bool validate_precision_reduction_feasibility(
+    Group &__restrict__ group, IndexType block_size,
+    ValueType *__restrict__ row, ValueType *__restrict__ work, size_type stride)
+{
+    using gko::detail::float_traits;
+    // save original data and reduce precision
+    if (group.thread_rank() < block_size) {
+#pragma unroll
+        for (auto i = 0u; i < max_block_size; ++i) {
+            if (i < block_size) {
+                work[i * stride + group.thread_rank()] = row[i];
+                row[i] =
+                    static_cast(static_cast(row[i]));
+            }
+        }
+    }
+
+    // compute the condition number
+    auto perm = group.thread_rank();
+    auto trans_perm = perm;
+    auto block_cond = compute_infinity_norm(group, block_size,
+                                            block_size, row);
+    auto succeeded =
+        invert_block(group, block_size, row, perm, trans_perm);
+    block_cond *= compute_infinity_norm(group, block_size,
+                                        block_size, row);
+
+    // restore original data
+    if (group.thread_rank() < block_size) {
+#pragma unroll
+        for (auto i = 0u; i < max_block_size; ++i) {
+            if (i < block_size) {
+                row[i] = work[i * stride + group.thread_rank()];
+            }
+        }
+    }
+
+    return succeeded && block_cond >= 1.0 &&
+           block_cond * float_traits>::eps < 1e-3;
+}
+
+/**
+ * Builds the inverted diagonal blocks of a CSR matrix.
+ *
+ * Every thread first takes part in csr::extract_transposed_diag_blocks
+ * (called before the block_id bound check), using a shared-memory
+ * workspace offset by threadIdx.z. Subwarps with block_id < num_blocks
+ * then run invert_block on their `row` data and write the result with
+ * copy_matrix to block_data at
+ * storage_scheme.get_global_block_offset(block_id).
+ *
+ * num_rows is not referenced in the body.
+ */
+template
+__global__ void __launch_bounds__(warps_per_block *config::warp_size) generate(
+    size_type num_rows, const IndexType *__restrict__ row_ptrs,
+    const IndexType *__restrict__ col_idxs,
+    const ValueType *__restrict__ values, ValueType *__restrict__ block_data,
+    preconditioner::block_interleaved_storage_scheme storage_scheme,
+    const IndexType *__restrict__ block_ptrs, size_type num_blocks)
+{
+    const auto block_id =
+        thread::get_subwarp_id();
+    const auto block = group::this_thread_block();
+    ValueType row[max_block_size];
+    __shared__ UninitializedArray
+        workspace;
+    csr::extract_transposed_diag_blocks(
+        block, config::warp_size / subwarp_size, row_ptrs, col_idxs, values,
+        block_ptrs, num_blocks, row, 1,
+        workspace + threadIdx.z * max_block_size);
+    const auto subwarp = group::tiled_partition(block);
+    if (block_id < num_blocks) {
+        const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id];
+        auto perm = subwarp.thread_rank();
+        auto trans_perm = subwarp.thread_rank();
+        invert_block(subwarp, block_size, row, perm,
+                     trans_perm);
+        copy_matrix(
+            subwarp, block_size, row, 1, perm, trans_perm,
+            block_data + storage_scheme.get_global_block_offset(block_id),
+            storage_scheme.get_stride());
+    }
+}
+
+/**
+ * Adaptive-precision variant of generate.
+ *
+ * In addition to inverting each block it stores the product of the
+ * infinity norms taken before and after inversion in
+ * conditioning[block_id], and derives a precision descriptor: either the
+ * singleton of the requested block_precisions[block_id] or, when that is
+ * precision_reduction::autodetect(), the result of
+ * get_supported_storage_reductions (which is given two feasibility
+ * callbacks that use block_data as scratch space).
+ *
+ * The descriptors are AND-reduced over `warp`; subwarps without a valid
+ * block contribute ~uint32{} (all bits set). The resulting precision is
+ * written back to block_precisions and used to store the block.
+ *
+ * num_rows is not referenced in the body.
+ */
+template
+__global__ void
+__launch_bounds__(warps_per_block *config::warp_size) adaptive_generate(
+    size_type num_rows, const IndexType *__restrict__ row_ptrs,
+    const IndexType *__restrict__ col_idxs,
+    const ValueType *__restrict__ values, remove_complex accuracy,
+    ValueType *__restrict__ block_data,
+    preconditioner::block_interleaved_storage_scheme storage_scheme,
+    remove_complex *__restrict__ conditioning,
+    precision_reduction *__restrict__ block_precisions,
+    const IndexType *__restrict__ block_ptrs, size_type num_blocks)
+{
+    // extract blocks
+    const auto block_id =
+        thread::get_subwarp_id();
+    const auto block = group::this_thread_block();
+    ValueType row[max_block_size];
+    __shared__ UninitializedArray
+        workspace;
+    csr::extract_transposed_diag_blocks(
+        block, config::warp_size / subwarp_size, row_ptrs, col_idxs, values,
+        block_ptrs, num_blocks, row, 1,
+        workspace + threadIdx.z * max_block_size);
+
+    // compute inverse and figure out the correct precision
+    const auto subwarp = group::tiled_partition(block);
+    const auto block_size =
+        block_id < num_blocks ? block_ptrs[block_id + 1] - block_ptrs[block_id]
+                              : 0;
+    auto perm = subwarp.thread_rank();
+    auto trans_perm = subwarp.thread_rank();
+    auto prec_descriptor = ~uint32{};  // neutral element of the AND-reduce
+    if (block_id < num_blocks) {
+        auto block_cond = compute_infinity_norm(
+            subwarp, block_size, block_size, row);
+        invert_block(subwarp, block_size, row, perm,
+                     trans_perm);
+        block_cond *= compute_infinity_norm(subwarp, block_size,
+                                            block_size, row);
+        conditioning[block_id] = block_cond;
+        const auto prec = block_precisions[block_id];
+        prec_descriptor =
+            preconditioner::detail::precision_reduction_descriptor::singleton(
+                prec);
+        if (prec == precision_reduction::autodetect()) {
+            using preconditioner::detail::get_supported_storage_reductions;
+            prec_descriptor = get_supported_storage_reductions(
+                accuracy, block_cond,
+                [&subwarp, &block_size, &row, &block_data, &storage_scheme,
+                 &block_id] {
+                    using target = reduce_precision;
+                    return validate_precision_reduction_feasibility<
+                        max_block_size, target>(
+                        subwarp, block_size, row,
+                        block_data +
+                            storage_scheme.get_global_block_offset(block_id),
+                        storage_scheme.get_stride());
+                },
+                [&subwarp, &block_size, &row, &block_data, &storage_scheme,
+                 &block_id] {
+                    using target =
+                        reduce_precision>;
+                    return validate_precision_reduction_feasibility<
+                        max_block_size, target>(
+                        subwarp, block_size, row,
+                        block_data +
+                            storage_scheme.get_global_block_offset(block_id),
+                        storage_scheme.get_stride());
+                });
+        }
+    }
+
+    // make sure all blocks in the group have the same precision
+    const auto warp = group::tiled_partition(block);
+    const auto prec =
+        preconditioner::detail::get_optimal_storage_reduction(reduce(
+            warp, prec_descriptor, [](uint32 x, uint32 y) { return x & y; }));
+
+    // store the block back into memory
+    if (block_id < num_blocks) {
+        block_precisions[block_id] = prec;
+        GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
+            ValueType, prec,
+            copy_matrix(
+                subwarp, block_size, row, 1, perm, trans_perm,
+                reinterpret_cast(
+                    block_data + storage_scheme.get_group_offset(block_id)) +
+                    storage_scheme.get_block_offset(block_id),
+                storage_scheme.get_stride()));
+    }
+}
+
+
+}  // namespace kernel
diff --git a/common/preconditioner/jacobi_kernels.hpp.inc b/common/preconditioner/jacobi_kernels.hpp.inc
new file mode 100644
index 00000000000..d480a0a154a
--- /dev/null
+++ b/common/preconditioner/jacobi_kernels.hpp.inc
@@ -0,0 +1,215 @@
+/*************************************************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +template +__global__ +__launch_bounds__(warps_per_block *config::warp_size) void duplicate_array( + const precision_reduction *__restrict__ source, size_type source_size, + precision_reduction *__restrict__ dest, size_type dest_size) +{ + auto grid = group::this_grid(); + if (grid.thread_rank() >= dest_size) { + return; + } + for (auto i = grid.thread_rank(); i < dest_size; i += grid.size()) { + dest[i] = source[i % source_size]; + } +} + + +template +__global__ void compare_adjacent_rows(size_type num_rows, int32 max_block_size, + const IndexType *__restrict__ row_ptrs, + const IndexType *__restrict__ col_idx, + bool *__restrict__ matching_next_row) +{ + const auto warp = + group::tiled_partition(group::this_thread_block()); + const auto local_tid = warp.thread_rank(); + const auto warp_id = thread::get_subwarp_id_flat(); + + if (warp_id >= num_rows - 1) { + return; + } + + const auto curr_row_start = row_ptrs[warp_id]; + const auto next_row_start = row_ptrs[warp_id + 1]; + const auto next_row_end = row_ptrs[warp_id + 2]; + + const auto nz_this_row = next_row_end - next_row_start; + const auto nz_prev_row = next_row_start - curr_row_start; + + if (nz_this_row != nz_prev_row) { + matching_next_row[warp_id] = false; + return; + } + size_type steps = ceildiv(nz_this_row, config::warp_size); + for (size_type i = 0; i < steps; i++) { + auto j = local_tid + i * config::warp_size; + auto 
prev_col = (curr_row_start + j < next_row_start) + ? col_idx[curr_row_start + j] + : 0; + auto this_col = (curr_row_start + j < next_row_start) + ? col_idx[next_row_start + j] + : 0; + if (warp.any(prev_col != this_col)) { + matching_next_row[warp_id] = false; + return; + } + } + matching_next_row[warp_id] = true; +} + + +template +__global__ void generate_natural_block_pointer( + size_type num_rows, int32 max_block_size, + const bool *__restrict__ matching_next_row, + IndexType *__restrict__ block_ptrs, size_type *__restrict__ num_blocks_arr) +{ + block_ptrs[0] = 0; + if (num_rows == 0) { + return; + } + size_type num_blocks = 1; + int32 current_block_size = 1; + for (size_type i = 0; i < num_rows - 1; ++i) { + if ((matching_next_row[i]) && (current_block_size < max_block_size)) { + ++current_block_size; + } else { + block_ptrs[num_blocks] = + block_ptrs[num_blocks - 1] + current_block_size; + ++num_blocks; + current_block_size = 1; + } + } + block_ptrs[num_blocks] = block_ptrs[num_blocks - 1] + current_block_size; + num_blocks_arr[0] = num_blocks; +} + + +template +__global__ void agglomerate_supervariables_kernel( + int32 max_block_size, size_type num_natural_blocks, + IndexType *__restrict__ block_ptrs, size_type *__restrict__ num_blocks_arr) +{ + num_blocks_arr[0] = 0; + if (num_natural_blocks == 0) { + return; + } + size_type num_blocks = 1; + int32 current_block_size = block_ptrs[1] - block_ptrs[0]; + for (size_type i = 1; i < num_natural_blocks; ++i) { + const int32 block_size = block_ptrs[i + 1] - block_ptrs[i]; + if (current_block_size + block_size <= max_block_size) { + current_block_size += block_size; + } else { + block_ptrs[num_blocks] = block_ptrs[i]; + ++num_blocks; + current_block_size = block_size; + } + } + block_ptrs[num_blocks] = block_ptrs[num_natural_blocks]; + num_blocks_arr[0] = num_blocks; +} + + +template +__global__ void __launch_bounds__(warps_per_block *config::warp_size) + transpose_jacobi(const ValueType *__restrict__ blocks, + 
preconditioner::block_interleaved_storage_scheme + storage_scheme, + const IndexType *__restrict__ block_ptrs, + size_type num_blocks, ValueType *__restrict__ out_blocks) +{ + const auto block_id = + thread::get_subwarp_id(); + const auto subwarp = + group::tiled_partition(group::this_thread_block()); + if (block_id >= num_blocks) { + return; + } + const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; + + const auto block_ofs = storage_scheme.get_global_block_offset(block_id); + const auto block_stride = storage_scheme.get_stride(); + const auto rank = subwarp.thread_rank(); + if (rank < block_size) { + for (IndexType i = 0; i < block_size; ++i) { + auto val = blocks[block_ofs + i * block_stride + rank]; + out_blocks[block_ofs + i + rank * block_stride] = + conjugate ? conj(val) : val; + } + } +} + + +template +__global__ void +__launch_bounds__(warps_per_block *config::warp_size) adaptive_transpose_jacobi( + const ValueType *__restrict__ blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const precision_reduction *__restrict__ block_precisions, + const IndexType *__restrict__ block_ptrs, size_type num_blocks, + ValueType *__restrict__ out_blocks) +{ + const auto block_id = + thread::get_subwarp_id(); + const auto subwarp = + group::tiled_partition(group::this_thread_block()); + if (block_id >= num_blocks) { + return; + } + const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; + + const auto block_stride = storage_scheme.get_stride(); + const auto rank = subwarp.thread_rank(); + if (rank < block_size) { + GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( + ValueType, block_precisions[block_id], + auto local_block = + reinterpret_cast( + blocks + storage_scheme.get_group_offset(block_id)) + + storage_scheme.get_block_offset(block_id); + auto local_out_block = + reinterpret_cast( + out_blocks + storage_scheme.get_group_offset(block_id)) + + storage_scheme.get_block_offset(block_id); + for (IndexType i = 0; i 
< block_size; ++i) { + auto val = local_block[i * block_stride + rank]; + local_out_block[i + rank * block_stride] = + conjugate ? conj(val) : val; + }); + } +} diff --git a/common/preconditioner/jacobi_simple_apply_kernel.hpp.inc b/common/preconditioner/jacobi_simple_apply_kernel.hpp.inc new file mode 100644 index 00000000000..c7a472bd409 --- /dev/null +++ b/common/preconditioner/jacobi_simple_apply_kernel.hpp.inc @@ -0,0 +1,104 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+namespace kernel {
+
+/**
+ * Block-Jacobi apply that overwrites x.
+ *
+ * block_id is taken from the subwarp id; subwarps whose id is not below
+ * num_blocks exit immediately. Lanes with rank < block_size load their
+ * entry of b (other lanes keep zero) and pass it to multiply_vec together
+ * with the block at storage_scheme.get_global_block_offset(block_id).
+ * The reducer is `result = out`, i.e. the previous content of x is
+ * replaced.
+ */
+template
+__global__ void __launch_bounds__(warps_per_block *config::warp_size) apply(
+    const ValueType *__restrict__ blocks,
+    preconditioner::block_interleaved_storage_scheme storage_scheme,
+    const IndexType *__restrict__ block_ptrs, size_type num_blocks,
+    const ValueType *__restrict__ b, int32 b_stride, ValueType *__restrict__ x,
+    int32 x_stride)
+{
+    const auto block_id =
+        thread::get_subwarp_id();
+    const auto subwarp =
+        group::tiled_partition(group::this_thread_block());
+    if (block_id >= num_blocks) {
+        return;
+    }
+    const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id];
+    ValueType v = zero();
+    if (subwarp.thread_rank() < block_size) {
+        v = b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride];
+    }
+    multiply_vec(
+        subwarp, block_size, v,
+        blocks + storage_scheme.get_global_block_offset(block_id) +
+            subwarp.thread_rank(),
+        storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride,
+        x_stride,
+        [](ValueType &result, const ValueType &out) { result = out; });
+}
+
+/**
+ * Adaptive-precision variant of apply: same flow, but the block is read
+ * through GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION using
+ * block_precisions[block_id] and addressed via get_group_offset() +
+ * get_block_offset(). The reducer is again `result = out` (overwrite).
+ */
+template
+__global__ void __launch_bounds__(warps_per_block *config::warp_size)
+    adaptive_apply(const ValueType *__restrict__ blocks,
+                   preconditioner::block_interleaved_storage_scheme
+                       storage_scheme,
+                   const precision_reduction *__restrict__ block_precisions,
+                   const IndexType *__restrict__ block_ptrs,
+                   size_type num_blocks, const ValueType *__restrict__ b,
+                   int32 b_stride, ValueType *__restrict__ x, int32 x_stride)
+{
+    const auto block_id =
+        thread::get_subwarp_id();
+    const auto subwarp =
+        group::tiled_partition(group::this_thread_block());
+    if (block_id >= num_blocks) {
+        return;
+    }
+    const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id];
+    ValueType v = zero();
+    if (subwarp.thread_rank() < block_size) {
+        v = b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride];
+    }
+    GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
+        ValueType, block_precisions[block_id],
+        multiply_vec(
+            subwarp, block_size, v,
+            reinterpret_cast(
+                blocks + storage_scheme.get_group_offset(block_id)) +
+                storage_scheme.get_block_offset(block_id) +
+                subwarp.thread_rank(),
+            storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride,
+            x_stride,
+            [](ValueType &result, const ValueType &out) { result = out; }));
+}
+
+
+}  // namespace kernel
diff --git a/common/solver/bicg_kernels.hpp.inc b/common/solver/bicg_kernels.hpp.inc
new file mode 100644
index 00000000000..fdb8ee8f3f9
--- /dev/null
+++ b/common/solver/bicg_kernels.hpp.inc
@@ -0,0 +1,111 @@
+/*************************************************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +template +__global__ __launch_bounds__(default_block_size) void initialize_kernel( + size_type num_rows, size_type num_cols, size_type stride, + const ValueType *__restrict__ b, ValueType *__restrict__ r, + ValueType *__restrict__ z, ValueType *__restrict__ p, + ValueType *__restrict__ q, ValueType *__restrict__ r2, + ValueType *__restrict__ z2, ValueType *__restrict__ p2, + ValueType *__restrict__ q2, ValueType *__restrict__ prev_rho, + ValueType *__restrict__ rho, stopping_status *__restrict__ stop_status) +{ + const auto tidx = thread::get_thread_id_flat(); + + if (tidx < num_cols) { + rho[tidx] = zero(); + prev_rho[tidx] = one(); + stop_status[tidx].reset(); + } + + if (tidx < num_rows * stride) { + r[tidx] = b[tidx]; + z[tidx] = zero(); + p[tidx] = zero(); + q[tidx] = zero(); + r2[tidx] = b[tidx]; + z2[tidx] = zero(); + p2[tidx] = zero(); + q2[tidx] = zero(); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void step_1_kernel( + size_type num_rows, size_type num_cols, size_type stride, + ValueType *__restrict__ p, const ValueType *__restrict__ z, + ValueType *__restrict__ p2, const ValueType *__restrict__ z2, + const 
ValueType *__restrict__ rho, const ValueType *__restrict__ prev_rho, + const stopping_status *__restrict__ stop_status) +{ + const auto tidx = thread::get_thread_id_flat(); + const auto col = tidx % stride; + if (col >= num_cols || tidx >= num_rows * stride || + stop_status[col].has_stopped()) { + return; + } + const auto tmp = rho[col] / prev_rho[col]; + + p[tidx] = + prev_rho[col] == zero() ? z[tidx] : z[tidx] + tmp * p[tidx]; + + p2[tidx] = prev_rho[col] == zero() ? z2[tidx] + : z2[tidx] + tmp * p2[tidx]; +} + + +template +__global__ __launch_bounds__(default_block_size) void step_2_kernel( + size_type num_rows, size_type num_cols, size_type stride, + size_type x_stride, ValueType *__restrict__ x, ValueType *__restrict__ r, + ValueType *__restrict__ r2, const ValueType *__restrict__ p, + const ValueType *__restrict__ q, const ValueType *__restrict__ q2, + const ValueType *__restrict__ beta, const ValueType *__restrict__ rho, + const stopping_status *__restrict__ stop_status) +{ + const auto tidx = thread::get_thread_id_flat(); + const auto row = tidx / stride; + const auto col = tidx % stride; + + if (col >= num_cols || tidx >= num_rows * num_cols || + stop_status[col].has_stopped()) { + return; + } + if (beta[col] != zero()) { + const auto tmp = rho[col] / beta[col]; + x[row * x_stride + col] += tmp * p[tidx]; + r[tidx] -= tmp * q[tidx]; + r2[tidx] -= tmp * q2[tidx]; + } +} diff --git a/common/solver/bicgstab_kernels.hpp.inc b/common/solver/bicgstab_kernels.hpp.inc new file mode 100644 index 00000000000..03071970fcc --- /dev/null +++ b/common/solver/bicgstab_kernels.hpp.inc @@ -0,0 +1,168 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/
+/**
+ * BiCGSTAB initialization.
+ *
+ * For tidx < num_cols: prev_rho, rho, alpha, beta, gamma and omega are
+ * set to one() and stop_status is reset.
+ * For tidx < num_rows * stride: r is a copy of b; rr, y, s, t, z, v and p
+ * are zeroed.
+ */
+template
+__global__ __launch_bounds__(default_block_size) void initialize_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    const ValueType *__restrict__ b, ValueType *__restrict__ r,
+    ValueType *__restrict__ rr, ValueType *__restrict__ y,
+    ValueType *__restrict__ s, ValueType *__restrict__ t,
+    ValueType *__restrict__ z, ValueType *__restrict__ v,
+    ValueType *__restrict__ p, ValueType *__restrict__ prev_rho,
+    ValueType *__restrict__ rho, ValueType *__restrict__ alpha,
+    ValueType *__restrict__ beta, ValueType *__restrict__ gamma,
+    ValueType *__restrict__ omega, stopping_status *__restrict__ stop_status)
+{
+    const auto tidx = thread::get_thread_id_flat();
+
+    if (tidx < num_cols) {
+        prev_rho[tidx] = one();
+        rho[tidx] = one();
+        alpha[tidx] = one();
+        beta[tidx] = one();
+        gamma[tidx] = one();
+        omega[tidx] = one();
+        stop_status[tidx].reset();
+    }
+
+    if (tidx < num_rows * stride) {
+        r[tidx] = b[tidx];
+        rr[tidx] = zero();
+        y[tidx] = zero();
+        s[tidx] = zero();
+        t[tidx] = zero();
+        z[tidx] = zero();
+        v[tidx] = zero();
+        p[tidx] = zero();
+    }
+}
+
+/**
+ * BiCGSTAB step 1:
+ * p = r + (rho / prev_rho) * (alpha / omega) * (p - omega * v).
+ *
+ * When prev_rho * omega is zero the correction term is dropped and p = r.
+ * Padding columns, out-of-range threads and stopped columns are skipped.
+ */
+template
+__global__ __launch_bounds__(default_block_size) void step_1_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    const ValueType *__restrict__ r, ValueType *__restrict__ p,
+    const ValueType *__restrict__ v, const ValueType *__restrict__ rho,
+    const ValueType *__restrict__ prev_rho, const ValueType *__restrict__ alpha,
+    const ValueType *__restrict__ omega,
+    const stopping_status *__restrict__ stop_status)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    const auto col = tidx % stride;
+    if (col >= num_cols || tidx >= num_rows * stride ||
+        stop_status[col].has_stopped()) {
+        return;
+    }
+    auto res = r[tidx];
+    if (prev_rho[col] * omega[col] != zero()) {
+        const auto tmp = (rho[col] / prev_rho[col]) * (alpha[col] / omega[col]);
+        res += tmp * (p[tidx] - omega[col] * v[tidx]);
+    }
+    p[tidx] = res;
+}
+
+/**
+ * BiCGSTAB step 2: alpha = rho / beta and s = r - alpha * v.
+ *
+ * When beta is zero, alpha is set to zero() and s = r. alpha[col] is
+ * written by every thread of that column (all with the same value).
+ */
+template
+__global__ __launch_bounds__(default_block_size) void step_2_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    const ValueType *__restrict__ r, ValueType *__restrict__ s,
+    const ValueType *__restrict__ v, const ValueType *__restrict__ rho,
+    ValueType *__restrict__ alpha, const ValueType *__restrict__ beta,
+    const stopping_status *__restrict__ stop_status)
+{
+    const size_type tidx = thread::get_thread_id_flat();
+    const size_type col = tidx % stride;
+    if (col >= num_cols || tidx >= num_rows * stride ||
+        stop_status[col].has_stopped()) {
+        return;
+    }
+    auto t_alpha = zero();
+    auto t_s = r[tidx];
+    if (beta[col] != zero()) {
+        t_alpha = rho[col] / beta[col];
+        t_s -= t_alpha * v[tidx];
+    }
+    alpha[col] = t_alpha;
+    s[tidx] = t_s;
+}
+
+/**
+ * BiCGSTAB step 3: omega = gamma / beta,
+ * x += alpha * y + omega * z and r = s - omega * t.
+ *
+ * When beta is zero, omega is set to zero() and only the alpha * y update
+ * is applied to x (r = s). x uses its own x_stride.
+ */
+template
+__global__ __launch_bounds__(default_block_size) void step_3_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    size_type x_stride, ValueType *__restrict__ x, ValueType *__restrict__ r,
+    const ValueType *__restrict__ s, const ValueType *__restrict__ t,
+    const ValueType *__restrict__ y, const ValueType *__restrict__ z,
+    const ValueType *__restrict__ alpha, const ValueType *__restrict__ beta,
+    const ValueType *__restrict__ gamma, ValueType *__restrict__ omega,
+    const stopping_status *__restrict__ stop_status)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    const auto row = tidx / stride;
+    const auto col = tidx % stride;
+    if (col >= num_cols || tidx >= num_rows * stride ||
+        stop_status[col].has_stopped()) {
+        return;
+    }
+    const auto x_pos = row * x_stride + col;
+    auto t_omega = zero();
+    auto t_x = x[x_pos] + alpha[col] * y[tidx];
+    auto t_r = s[tidx];
+    if (beta[col] != zero()) {
+        t_omega = gamma[col] / beta[col];
+        t_x += t_omega * z[tidx];
+        t_r -= t_omega * t[tidx];
+    }
+    omega[col] = t_omega;
+    x[x_pos] = t_x;
+    r[tidx] = t_r;
+}
+
+/**
+ * BiCGSTAB finalization: for columns that have stopped but are not yet
+ * finalized, applies x += alpha * y and marks the column finalized.
+ *
+ * NOTE(review): stop_status[col].finalize() is called by every row thread
+ * of the column while other threads of that column may still be testing
+ * is_finalized(); confirm that this ordering is safe.
+ */
+template
+__global__ __launch_bounds__(default_block_size) void finalize_kernel(
+    size_type num_rows, size_type num_cols, size_type stride,
+    size_type x_stride, ValueType *__restrict__ x,
+    const ValueType *__restrict__ y, const ValueType *__restrict__ alpha,
+    stopping_status *__restrict__ stop_status)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    const auto row = tidx / stride;
+    const auto col = tidx % stride;
+    if (col >= num_cols || tidx >= num_rows * stride ||
+        stop_status[col].is_finalized() || !stop_status[col].has_stopped()) {
+        return;
+    }
+    const auto x_pos = row * x_stride + col;
+    x[x_pos] = x[x_pos] + alpha[col] * y[tidx];
+    stop_status[col].finalize();
+}
diff --git a/common/solver/cg_kernels.hpp.inc b/common/solver/cg_kernels.hpp.inc
new file mode 100644
index 00000000000..d318c30f338
--- /dev/null
+++ b/common/solver/cg_kernels.hpp.inc
@@ -0,0 +1,98 @@
+/*************************************************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +/* CG setup, one thread per entry: threads below num_cols set rho to zero, prev_rho to one and reset the column's stopping status; threads below num_rows * stride copy b into r and zero z, p and q. */ template +__global__ __launch_bounds__(default_block_size) void initialize_kernel( + size_type num_rows, size_type num_cols, size_type stride, + const ValueType *__restrict__ b, ValueType *__restrict__ r, + ValueType *__restrict__ z, ValueType *__restrict__ p, + ValueType *__restrict__ q, ValueType *__restrict__ prev_rho, + ValueType *__restrict__ rho, stopping_status *__restrict__ stop_status) +{ + const auto tidx = thread::get_thread_id_flat(); + + if (tidx < num_cols) { + rho[tidx] = zero(); + prev_rho[tidx] = one(); + stop_status[tidx].reset(); + } + + if (tidx < num_rows * stride) { + r[tidx] = b[tidx]; + z[tidx] = zero(); + p[tidx] = zero(); + q[tidx] = zero(); + } +} + + +/* CG step 1, one thread per stride-padded entry: p = z + (rho / prev_rho) * p, or p = z when prev_rho is zero. Padding entries (col >= num_cols) and columns whose stopping status has stopped are left untouched. */ template +__global__ __launch_bounds__(default_block_size) void step_1_kernel( + size_type num_rows, size_type num_cols, size_type stride, + ValueType *__restrict__ p, const ValueType *__restrict__ z, + const ValueType *__restrict__ rho, const ValueType *__restrict__ prev_rho, + const stopping_status *__restrict__ stop_status) +{ + const auto tidx = thread::get_thread_id_flat(); + const auto col = tidx % stride; + if (col >= num_cols || tidx >= num_rows * stride || + stop_status[col].has_stopped()) { + return; + } + const auto tmp = rho[col] / prev_rho[col]; + p[tidx] = + prev_rho[col] == zero() ? 
z[tidx] : z[tidx] + tmp * p[tidx]; +} + + +/* CG step 2, one thread per stride-padded entry: when beta is non-zero, x += (rho / beta) * p and r -= (rho / beta) * q. x is addressed with its own x_stride; p, q and r are addressed with tidx. */ template +__global__ __launch_bounds__(default_block_size) void step_2_kernel( + size_type num_rows, size_type num_cols, size_type stride, + size_type x_stride, ValueType *__restrict__ x, ValueType *__restrict__ r, + const ValueType *__restrict__ p, const ValueType *__restrict__ q, + const ValueType *__restrict__ beta, const ValueType *__restrict__ rho, + const stopping_status *__restrict__ stop_status) +{ + const auto tidx = thread::get_thread_id_flat(); + const auto row = tidx / stride; + const auto col = tidx % stride; + + /* tidx = row * stride + col, so the upper bound is num_rows * stride (same as step_1_kernel); bounding by num_rows * num_cols dropped the last rows whenever stride > num_cols. */ if (col >= num_cols || tidx >= num_rows * stride || + stop_status[col].has_stopped()) { + return; + } + if (beta[col] != zero()) { + const auto tmp = rho[col] / beta[col]; + x[row * x_stride + col] += tmp * p[tidx]; + r[tidx] -= tmp * q[tidx]; + } +} diff --git a/common/solver/cgs_kernels.hpp.inc b/common/solver/cgs_kernels.hpp.inc new file mode 100644 index 00000000000..d6c3e64cd4c --- /dev/null +++ b/common/solver/cgs_kernels.hpp.inc @@ -0,0 +1,137 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +/* CGS setup, one thread per entry: threads below num_cols set rho to zero, set alpha, beta, gamma and rho_prev to one and reset the column's stopping status; threads below num_rows * stride copy b into both r and r_tld and zero u, p, q, u_hat, v_hat and t. */ template +__global__ __launch_bounds__(default_block_size) void initialize_kernel( + size_type num_rows, size_type num_cols, size_type stride, + const ValueType *__restrict__ b, ValueType *__restrict__ r, + ValueType *__restrict__ r_tld, ValueType *__restrict__ p, + ValueType *__restrict__ q, ValueType *__restrict__ u, + ValueType *__restrict__ u_hat, ValueType *__restrict__ v_hat, + ValueType *__restrict__ t, ValueType *__restrict__ alpha, + ValueType *__restrict__ beta, ValueType *__restrict__ gamma, + ValueType *__restrict__ rho_prev, ValueType *__restrict__ rho, + stopping_status *__restrict__ stop_status) +{ + const auto tidx = thread::get_thread_id_flat(); + + if (tidx < num_cols) { + rho[tidx] = zero(); + alpha[tidx] = one(); + beta[tidx] = one(); + gamma[tidx] = one(); + rho_prev[tidx] = one(); + stop_status[tidx].reset(); + } + + if (tidx < num_rows * stride) { + r[tidx] = b[tidx]; + r_tld[tidx] = b[tidx]; + u[tidx] = zero(); + p[tidx] = zero(); + q[tidx] = zero(); + u_hat[tidx] = zero(); + v_hat[tidx] = zero(); + t[tidx] = zero(); + } +} + + +/* CGS step 1, one thread per stride-padded entry: when rho_prev is non-zero, beta = rho / rho_prev, u = r + beta * q and p = u + beta * (q + beta * p). Every thread of a column stores the same value into beta[col]. Padding entries and stopped columns are skipped. */ template +__global__ __launch_bounds__(default_block_size) void step_1_kernel( + size_type 
num_rows, size_type num_cols, size_type stride, + const ValueType *__restrict__ r, ValueType *__restrict__ u, + ValueType *__restrict__ p, const ValueType *__restrict__ q, + ValueType *__restrict__ beta, const ValueType *__restrict__ rho, + const ValueType *__restrict__ rho_prev, + const stopping_status *__restrict__ stop_status) +{ + const auto tidx = thread::get_thread_id_flat(); + const auto col = tidx % stride; + + if (col >= num_cols || tidx >= num_rows * stride || + stop_status[col].has_stopped()) { + return; + } + if (rho_prev[col] != zero()) { + beta[col] = rho[col] / rho_prev[col]; + u[tidx] = r[tidx] + beta[col] * q[tidx]; + p[tidx] = u[tidx] + beta[col] * (q[tidx] + beta[col] * p[tidx]); + } +} + + +/* CGS step 2, one thread per stride-padded entry: when gamma is non-zero, alpha = rho / gamma, q = u - alpha * v_hat and t = u + q. Padding entries and stopped columns are skipped. */ template +__global__ __launch_bounds__(default_block_size) void step_2_kernel( + size_type num_rows, size_type num_cols, size_type stride, + const ValueType *__restrict__ u, const ValueType *__restrict__ v_hat, + ValueType *__restrict__ q, ValueType *__restrict__ t, + ValueType *__restrict__ alpha, const ValueType *__restrict__ rho, + const ValueType *__restrict__ gamma, + const stopping_status *__restrict__ stop_status) +{ + const auto tidx = thread::get_thread_id_flat(); + const auto col = tidx % stride; + + if (col >= num_cols || tidx >= num_rows * stride || + stop_status[col].has_stopped()) { + return; + } + if (gamma[col] != zero()) { + alpha[col] = rho[col] / gamma[col]; + q[tidx] = u[tidx] - alpha[col] * v_hat[tidx]; + t[tidx] = u[tidx] + q[tidx]; + } +} + + +/* CGS step 3, one thread per stride-padded entry: x += alpha * v_hat (x addressed with x_stride) and r -= alpha * t. Padding entries and stopped columns are skipped. */ template +__global__ __launch_bounds__(default_block_size) void step_3_kernel( + size_type num_rows, size_type num_cols, size_type stride, + size_type x_stride, const ValueType *__restrict__ t, + const ValueType *__restrict__ v_hat, ValueType *__restrict__ r, + ValueType *__restrict__ x, const ValueType *__restrict__ alpha, + const stopping_status *__restrict__ stop_status) +{ + const auto tidx = thread::get_thread_id_flat(); + const auto row = tidx / stride; + const auto col = tidx % stride; + if 
(col >= num_cols || tidx >= num_rows * stride || + stop_status[col].has_stopped()) { + return; + } + const auto x_pos = row * x_stride + col; + /* Both new values are computed before either store. */ auto t_x = x[x_pos] + alpha[col] * v_hat[tidx]; + auto t_r = r[tidx] - alpha[col] * t[tidx]; + x[x_pos] = t_x; + r[tidx] = t_r; +} \ No newline at end of file diff --git a/common/solver/fcg_kernels.hpp.inc b/common/solver/fcg_kernels.hpp.inc new file mode 100644 index 00000000000..2b5b72029a2 --- /dev/null +++ b/common/solver/fcg_kernels.hpp.inc @@ -0,0 +1,104 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +/* FCG setup, one thread per entry: threads below num_cols set rho to zero, prev_rho and rho_t to one and reset the column's stopping status; threads below num_rows * stride copy b into r and t and zero z, p and q. */ template +__global__ __launch_bounds__(default_block_size) void initialize_kernel( + size_type num_rows, size_type num_cols, size_type stride, + const ValueType *__restrict__ b, ValueType *__restrict__ r, + ValueType *__restrict__ z, ValueType *__restrict__ p, + ValueType *__restrict__ q, ValueType *__restrict__ t, + ValueType *__restrict__ prev_rho, ValueType *__restrict__ rho, + ValueType *__restrict__ rho_t, stopping_status *__restrict__ stop_status) +{ + const auto tidx = thread::get_thread_id_flat(); + + if (tidx < num_cols) { + rho[tidx] = zero(); + prev_rho[tidx] = one(); + rho_t[tidx] = one(); + stop_status[tidx].reset(); + } + + if (tidx < num_rows * stride) { + r[tidx] = b[tidx]; + z[tidx] = zero(); + p[tidx] = zero(); + q[tidx] = zero(); + t[tidx] = b[tidx]; + } +} + + +/* FCG step 1, one thread per stride-padded entry: p = z + (rho / prev_rho) * p, or p = z when prev_rho is zero. Padding entries (col >= num_cols) and columns whose stopping status has stopped are left untouched. */ template +__global__ __launch_bounds__(default_block_size) void step_1_kernel( + size_type num_rows, size_type num_cols, size_type stride, + ValueType *__restrict__ p, const ValueType *__restrict__ z, + const ValueType *__restrict__ rho, const ValueType *__restrict__ prev_rho, + const stopping_status *__restrict__ stop_status) +{ + const auto tidx = thread::get_thread_id_flat(); + const auto col = tidx % stride; + if (col >= num_cols || tidx >= num_rows * stride || + stop_status[col].has_stopped()) { + return; + } + const auto tmp = rho[col] / prev_rho[col]; + p[tidx] = + prev_rho[col] == zero() ? 
z[tidx] : z[tidx] + tmp * p[tidx]; +} + + +/* FCG step 2, one thread per stride-padded entry: when beta is non-zero, x += (rho / beta) * p, r -= (rho / beta) * q and t receives the change of r (new r minus previous r). x is addressed with its own x_stride; p, q, r and t are addressed with tidx. */ template +__global__ __launch_bounds__(default_block_size) void step_2_kernel( + size_type num_rows, size_type num_cols, size_type stride, + size_type x_stride, ValueType *__restrict__ x, ValueType *__restrict__ r, + ValueType *__restrict__ t, const ValueType *__restrict__ p, + const ValueType *__restrict__ q, const ValueType *__restrict__ beta, + const ValueType *__restrict__ rho, + const stopping_status *__restrict__ stop_status) +{ + const auto tidx = thread::get_thread_id_flat(); + const auto row = tidx / stride; + const auto col = tidx % stride; + + /* tidx = row * stride + col, so the upper bound is num_rows * stride (same as step_1_kernel); bounding by num_rows * num_cols dropped the last rows whenever stride > num_cols. */ if (col >= num_cols || tidx >= num_rows * stride || + stop_status[col].has_stopped()) { + return; + } + if (beta[col] != zero()) { + const auto tmp = rho[col] / beta[col]; + const auto prev_r = r[tidx]; + x[row * x_stride + col] += tmp * p[tidx]; + r[tidx] -= tmp * q[tidx]; + t[tidx] = r[tidx] - prev_r; + } +} \ No newline at end of file diff --git a/common/solver/gmres_kernels.hpp.inc b/common/solver/gmres_kernels.hpp.inc new file mode 100644 index 00000000000..7b991879571 --- /dev/null +++ b/common/solver/gmres_kernels.hpp.inc @@ -0,0 +1,405 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +/* GMRES setup, part 1. Each thread does up to three independent things: resets stop_status[global_id] when global_id < num_cols; copies one entry of b (read with stride_b) into residual (written with stride_residual); and zeroes one entry of givens_sin and givens_cos over a krylov_dim x num_cols index range. */ // Must be called with at least `max(stride_b * num_rows, krylov_dim * +// num_cols)` threads in total. +template +__global__ __launch_bounds__(block_size) void initialize_1_kernel( + size_type num_rows, size_type num_cols, size_type krylov_dim, + const ValueType *__restrict__ b, size_type stride_b, + ValueType *__restrict__ residual, size_type stride_residual, + ValueType *__restrict__ givens_sin, size_type stride_sin, + ValueType *__restrict__ givens_cos, size_type stride_cos, + stopping_status *__restrict__ stop_status) +{ + const auto global_id = thread::get_thread_id_flat(); + + /* (row_idx, col_idx) decompose global_id with respect to the layout of b. */ const auto row_idx = global_id / stride_b; + const auto col_idx = global_id % stride_b; + + if (global_id < num_cols) { + stop_status[global_id].reset(); + } + + if (row_idx < num_rows && col_idx < num_cols) { + residual[row_idx * stride_residual + col_idx] = + b[row_idx * stride_b + col_idx]; + } + + if (global_id < krylov_dim * num_cols) { + /* The Givens arrays are indexed densely by num_cols, independently of stride_b. */ const auto row_givens = global_id / num_cols; + const auto col_givens = global_id % num_cols; + + givens_sin[row_givens * stride_sin + col_givens] = zero(); + givens_cos[row_givens * stride_cos + col_givens] = zero(); + } +} + 
+ +/* GMRES setup, part 2: threads below num_rhs copy residual_norm into residual_norm_collection and zero final_iter_nums; every in-range (row, col) thread stores residual / residual_norm[col] into krylov_bases. */ // Must be called with at least `num_rows * num_rhs` threads in total. +template +__global__ __launch_bounds__(block_size) void initialize_2_2_kernel( + size_type num_rows, size_type num_rhs, + const ValueType *__restrict__ residual, size_type stride_residual, + const remove_complex *__restrict__ residual_norm, + ValueType *__restrict__ residual_norm_collection, + ValueType *__restrict__ krylov_bases, size_type stride_krylov, + size_type *__restrict__ final_iter_nums) +{ + const auto global_id = thread::get_thread_id_flat(); + const auto row_idx = global_id / num_rhs; + const auto col_idx = global_id % num_rhs; + + if (global_id < num_rhs) { + residual_norm_collection[global_id] = residual_norm[global_id]; + final_iter_nums[global_id] = 0; + } + + if (row_idx < num_rows && col_idx < num_rhs) { + auto value = residual[row_idx * stride_residual + col_idx] / + residual_norm[col_idx]; + krylov_bases[row_idx * stride_krylov + col_idx] = value; + } +} + + +/* Adds one to final_iter_nums[i] for each of the first total_number entries whose stopping status has not stopped (the negated bool is added as 0 or 1). */ __global__ + __launch_bounds__(default_block_size) void increase_final_iteration_numbers_kernel( + size_type *__restrict__ final_iter_nums, + const stopping_status *__restrict__ stop_status, size_type total_number) +{ + const auto global_id = thread::get_thread_id_flat(); + if (global_id < total_number) { + final_iter_nums[global_id] += !stop_status[global_id].has_stopped(); + } +} + + +/* Accumulates, per column, the sum over rows of conj(krylov_bases) * next_krylov_basis into hessenberg_iter[k * stride_hessenberg + col] with atomic_add. blockIdx.x selects a group of default_dot_dim columns; blockIdx.y selects a slab of ceildiv(num_rows, gridDim.y) rows. Stopped columns contribute nothing and are not written. */ template +__global__ __launch_bounds__(default_dot_size) void multidot_kernel( + size_type k, size_type num_rows, size_type num_cols, + const ValueType *__restrict__ krylov_bases, + const ValueType *__restrict__ next_krylov_basis, size_type stride_krylov, + ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg, + const stopping_status *__restrict__ stop_status) +{ + const auto tidx = threadIdx.x; + const auto tidy = threadIdx.y; + const auto col_idx = blockIdx.x * default_dot_dim + tidx; + const auto num = ceildiv(num_rows, gridDim.y); + const auto start_row = blockIdx.y * num; + const auto end_row = + ((blockIdx.y + 
1) * num > num_rows) ? num_rows : (blockIdx.y + 1) * num; + // Used that way to get around dynamic initialization warning and + // template error when using `reduction_helper_array` directly in `reduce` + __shared__ + UninitializedArray + reduction_helper_array; + ValueType *__restrict__ reduction_helper = reduction_helper_array; + + ValueType local_res = zero(); + if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) { + for (size_type i = start_row + tidy; i < end_row; + i += default_dot_dim) { + const auto krylov_idx = i * stride_krylov + col_idx; + local_res += + conj(krylov_bases[krylov_idx]) * next_krylov_basis[krylov_idx]; + } + } + /* Partial sums are written as [tidx][tidy] and read back as [tidy][tidx], i.e. transposed through shared memory; the row pitch is default_dot_dim + 1 (presumably padding against bank conflicts - confirm). */ reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res; + __syncthreads(); + local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx]; + const auto tile_block = + group::tiled_partition(group::this_thread_block()); + const auto sum = + reduce(tile_block, local_res, + [](const ValueType &a, const ValueType &b) { return a + b; }); + /* After the transpose, tidy identifies the column and the thread with tidx == 0 publishes the tile's sum. */ const auto new_col_idx = blockIdx.x * default_dot_dim + tidy; + if (tidx == 0 && new_col_idx < num_cols && + !stop_status[new_col_idx].has_stopped()) { + const auto hessenberg_idx = k * stride_hessenberg + new_col_idx; + atomic_add(hessenberg_iter + hessenberg_idx, sum); + } +} + + +// Must be called with at least `num_rows * stride_next_krylov` threads in +// total. 
+/* For every in-range entry of a column that has not stopped: next_krylov_basis -= hessenberg_iter[k * stride_hessenberg + col] * krylov_bases. Both vectors are addressed with the same stride_krylov, so next_krylov_idx and krylov_idx hold the same value. */ template +__global__ __launch_bounds__(block_size) void update_next_krylov_kernel( + size_type k, size_type num_rows, size_type num_cols, + const ValueType *__restrict__ krylov_bases, + ValueType *__restrict__ next_krylov_basis, size_type stride_krylov, + const ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg, + const stopping_status *__restrict__ stop_status) +{ + const auto global_id = thread::get_thread_id_flat(); + const auto row_idx = global_id / stride_krylov; + const auto col_idx = global_id % stride_krylov; + + if (row_idx < num_rows && col_idx < num_cols && + !stop_status[col_idx].has_stopped()) { + const auto next_krylov_idx = row_idx * stride_krylov + col_idx; + const auto krylov_idx = row_idx * stride_krylov + col_idx; + const auto hessenberg_idx = k * stride_hessenberg + col_idx; + + next_krylov_basis[next_krylov_idx] -= + hessenberg_iter[hessenberg_idx] * krylov_bases[krylov_idx]; + } +} + + +// Must be called with at least `num_cols` blocks, each with `block_size` +// threads. `block_size` must be a power of 2. 
+/* One thread block per column (col_idx = blockIdx.x): sums next_krylov_value * next_krylov_value over all rows, reduces the per-thread partial sums across the block, and stores the square root of the total in hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx]. Stopped columns are skipped. */ template +__global__ __launch_bounds__(block_size) void update_hessenberg_2_kernel( + size_type iter, size_type num_rows, size_type num_cols, + const ValueType *__restrict__ next_krylov_basis, + size_type stride_next_krylov, ValueType *__restrict__ hessenberg_iter, + size_type stride_hessenberg, + const stopping_status *__restrict__ stop_status) +{ + const auto tidx = threadIdx.x; + const auto col_idx = blockIdx.x; + + // Used that way to get around dynamic initialization warning and + // template error when using `reduction_helper_array` directly in `reduce` + __shared__ UninitializedArray reduction_helper_array; + ValueType *__restrict__ reduction_helper = reduction_helper_array; + + /* The condition depends only on blockIdx.x, so all threads of a block take the same branch. */ if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) { + ValueType local_res{}; + for (size_type i = tidx; i < num_rows; i += block_size) { + const auto next_krylov_idx = i * stride_next_krylov + col_idx; + const auto next_krylov_value = next_krylov_basis[next_krylov_idx]; + + /* NOTE(review): plain square without conj(); for a complex ValueType this is not |value|^2 - confirm whether complex types are instantiated here. */ local_res += next_krylov_value * next_krylov_value; + } + + reduction_helper[tidx] = local_res; + + // Perform thread block reduction. Result is in reduction_helper[0] + reduce(group::this_thread_block(), reduction_helper, + [](const ValueType &a, const ValueType &b) { return a + b; }); + + if (tidx == 0) { + hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx] = + sqrt(reduction_helper[0]); + } + } +} + + +// Must be called with at least `num_rows * stride_krylov` threads in +// total. 
+/* Divides every in-range entry of krylov_bases belonging to a column that has not stopped by hessenberg_iter[(iter + 1) * stride_hessenberg + col]. */ template +__global__ __launch_bounds__(block_size) void update_krylov_kernel( + size_type iter, size_type num_rows, size_type num_cols, + ValueType *__restrict__ krylov_bases, size_type stride_krylov, + const ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg, + const stopping_status *__restrict__ stop_status) +{ + const auto global_id = thread::get_thread_id_flat(); + const auto row_idx = global_id / stride_krylov; + const auto col_idx = global_id % stride_krylov; + + if (row_idx < num_rows && col_idx < num_cols && + !stop_status[col_idx].has_stopped()) { + const auto krylov_idx = row_idx * stride_krylov + col_idx; + /* Read the divisor only after the bounds check: padding columns (col_idx >= num_cols) and surplus threads previously loaded hessenberg_iter at an index outside the valid columns, potentially past the end of the buffer. */ const auto hessenberg = + hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx]; + + krylov_bases[krylov_idx] /= hessenberg; + } +} + + +/* Computes the Givens rotation for (this_hess, next_hess), returns it through register_sin / register_cos and stores it in row iter of givens_sin / givens_cos. A zero this_hess yields cos = 0, sin = 1; otherwise the hypotenuse is formed from values divided by scale = |this_hess| + |next_hess|. */ template +__device__ void calculate_sin_and_cos_kernel( + size_type col_idx, size_type num_cols, size_type iter, + const ValueType &this_hess, const ValueType &next_hess, + ValueType *givens_sin, size_type stride_sin, ValueType *givens_cos, + size_type stride_cos, ValueType &register_sin, ValueType &register_cos) +{ + if (this_hess == zero()) { + register_cos = zero(); + register_sin = one(); + } else { + const auto scale = abs(this_hess) + abs(next_hess); + const auto hypotenuse = + scale * sqrt(abs(this_hess / scale) * abs(this_hess / scale) + + abs(next_hess / scale) * abs(next_hess / scale)); + register_cos = conj(this_hess) / hypotenuse; + register_sin = conj(next_hess) / hypotenuse; + } + givens_cos[iter * stride_cos + col_idx] = register_cos; + givens_sin[iter * stride_sin + col_idx] = register_sin; +} + + +/* Applies the rotation to the residual-norm collection of one column: entry iter becomes cos * old, entry iter + 1 becomes -conj(sin) * old, and residual_norm[col_idx] is set to the absolute value of the new entry iter + 1. */ template +__device__ void calculate_residual_norm_kernel( + size_type col_idx, size_type num_cols, size_type iter, + const ValueType &register_sin, const ValueType &register_cos, + remove_complex *residual_norm, + ValueType *residual_norm_collection, + size_type stride_residual_norm_collection) +{ + const auto this_rnc = + residual_norm_collection[iter * stride_residual_norm_collection + + 
col_idx]; + const auto next_rnc = -conj(register_sin) * this_rnc; + residual_norm_collection[iter * stride_residual_norm_collection + col_idx] = + register_cos * this_rnc; + residual_norm[col_idx] = abs(next_rnc); + residual_norm_collection[(iter + 1) * stride_residual_norm_collection + + col_idx] = next_rnc; +} + + +/* One thread per column: applies the iter previously stored rotations to the column of hessenberg_iter, computes and applies the new rotation (which zeroes entry iter + 1), then updates the residual norm. Stopped columns are skipped. */ // Must be called with at least `num_cols` threads in total. +template +__global__ __launch_bounds__(block_size) void givens_rotation_kernel( + size_type num_rows, size_type num_cols, size_type iter, + ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg, + ValueType *__restrict__ givens_sin, size_type stride_sin, + ValueType *__restrict__ givens_cos, size_type stride_cos, + remove_complex *__restrict__ residual_norm, + ValueType *__restrict__ residual_norm_collection, + size_type stride_residual_norm_collection, + const stopping_status *__restrict__ stop_status) +{ + const auto col_idx = thread::get_thread_id_flat(); + + if (col_idx >= num_cols || stop_status[col_idx].has_stopped()) { + return; + } + + auto this_hess = hessenberg_iter[col_idx]; + auto next_hess = hessenberg_iter[stride_hessenberg + col_idx]; + for (size_type i = 0; i < iter; ++i) { + const auto cos = givens_cos[i * stride_cos + col_idx]; + const auto sin = givens_sin[i * stride_sin + col_idx]; + hessenberg_iter[i * stride_hessenberg + col_idx] = + cos * this_hess + sin * next_hess; + this_hess = conj(cos) * next_hess - conj(sin) * this_hess; + next_hess = hessenberg_iter[(i + 2) * stride_hessenberg + col_idx]; + } + // for j in 0:iter - 1 + // temp = cos(j)*hessenberg(j) + + // sin(j)*hessenberg(j+1) + // hessenberg(j+1) = -sin(j)*hessenberg(j) + + // cos(j)*hessenberg(j+1) + // hessenberg(j) = temp; + // end + + ValueType register_sin; + ValueType register_cos; + calculate_sin_and_cos_kernel(col_idx, num_cols, iter, this_hess, next_hess, + givens_sin, stride_sin, givens_cos, stride_cos, + register_sin, register_cos); + // Calculate sin and cos on hessenberg(iter) and 
hessenberg(iter+1) + + hessenberg_iter[iter * stride_hessenberg + col_idx] = + register_cos * this_hess + register_sin * next_hess; + hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx] = + zero(); + // hessenberg(iter) = cos(iter)*hessenberg(iter) + + // sin(iter)*hessenberg(iter+1) + // hessenberg(iter+1) = 0 + + calculate_residual_norm_kernel( + col_idx, num_cols, iter, register_sin, register_cos, residual_norm, + residual_norm_collection, stride_residual_norm_collection); + // Calculate residual norm +} + + +/* One thread per right-hand side: back substitution over the first final_iter_nums[col_idx] rows, writing the solution into y. The Hessenberg entry (i, j) of a right-hand side is read at i * stride_hessenberg + j * num_rhs + col_idx. */ // Must be called with at least `num_rhs` threads in total. +template +__global__ __launch_bounds__(block_size) void solve_upper_triangular_kernel( + size_type num_cols, size_type num_rhs, + const ValueType *__restrict__ residual_norm_collection, + size_type stride_residual_norm_collection, + const ValueType *__restrict__ hessenberg, size_type stride_hessenberg, + ValueType *__restrict__ y, size_type stride_y, + const size_type *__restrict__ final_iter_nums) +{ + const auto col_idx = thread::get_thread_id_flat(); + + if (col_idx >= num_rhs) { + return; + } + + /* i is a signed int so that the descending loop can terminate at i < 0. */ for (int i = final_iter_nums[col_idx] - 1; i >= 0; --i) { + auto temp = + residual_norm_collection[i * stride_residual_norm_collection + + col_idx]; + for (size_type j = i + 1; j < final_iter_nums[col_idx]; ++j) { + temp -= hessenberg[i * stride_hessenberg + j * num_rhs + col_idx] * + y[j * stride_y + col_idx]; + } + + y[i * stride_y + col_idx] = + temp / hessenberg[i * stride_hessenberg + i * num_rhs + col_idx]; + } + // Solve upper triangular. + // y = hessenberg \ residual_norm_collection +} + + +// Must be called with at least `stride_preconditioner * num_rows` threads in +// total. 
+/* For every in-range (row, col) entry: before_preconditioner[global_id] = sum over j < final_iter_nums[col] of krylov_bases[(row + j * num_rows) * stride_krylov + col] * y[j * stride_y + col]. The j-th basis block therefore starts num_rows rows after the previous one in krylov_bases. */ template +__global__ __launch_bounds__(block_size) void calculate_Qy_kernel( + size_type num_rows, size_type num_cols, size_type num_rhs, + const ValueType *__restrict__ krylov_bases, size_type stride_krylov, + const ValueType *__restrict__ y, size_type stride_y, + ValueType *__restrict__ before_preconditioner, + size_type stride_preconditioner, + const size_type *__restrict__ final_iter_nums) +{ + const auto global_id = thread::get_thread_id_flat(); + const auto row_id = global_id / stride_preconditioner; + const auto col_id = global_id % stride_preconditioner; + + if (row_id < num_rows && col_id < num_cols) { + ValueType temp = zero(); + + for (size_type j = 0; j < final_iter_nums[col_id]; ++j) { + temp += + krylov_bases[(row_id + j * num_rows) * stride_krylov + col_id] * + y[j * stride_y + col_id]; + } + /* global_id equals row_id * stride_preconditioner + col_id, so it is the output position of this entry. */ before_preconditioner[global_id] = temp; + } +} diff --git a/common/solver/ir_kernels.hpp.inc b/common/solver/ir_kernels.hpp.inc new file mode 100644 index 00000000000..24a66f2795b --- /dev/null +++ b/common/solver/ir_kernels.hpp.inc @@ -0,0 +1,41 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +/* IR setup: resets the stopping status of each of the first num_cols entries, one thread per entry; threads with tidx >= num_cols do nothing. */ __global__ __launch_bounds__(default_block_size) void initialize_kernel( + size_type num_cols, stopping_status *stop_status) +{ + const auto tidx = thread::get_thread_id_flat(); + + if (tidx < num_cols) { + stop_status[tidx].reset(); + } +} \ No newline at end of file diff --git a/contributors.txt b/contributors.txt index 22856fbdb9f..fd97439ad7f 100644 --- a/contributors.txt +++ b/contributors.txt @@ -8,10 +8,12 @@ Cojean Terry Karlsruhe Institute of Technology Drzaic Jelena University of Zagreb Flegar Goran Universitat Jaume I Göbel Fritz Karlsruhe Institute of Technology +Grötzinger Dennis Karlsruhe Institute of Technology Grützmacher Thomas Karlsruhe Institute of Technology Heroux Mike Sandia National Laboratories Hoemmen Mark Sandia National Laboratories Holeksa Claudius Karlsruhe Institute of Technology +Maier Matthias Texas A&M University Nayak Pratik Karlsruhe Institute of Technology Ribizel Tobias Karlsruhe Institute of Technology Tsai Yuhsiang National Taiwan University diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index a976e362d97..036f6f1fe19 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -4,39 +4,46 @@ 
add_subdirectory(device_hooks) # placeholders for disabled modules add_library(ginkgo "") target_sources(ginkgo PRIVATE - base/combination.cpp - base/composition.cpp - base/executor.cpp - base/mtx_io.cpp - base/perturbation.cpp - base/version.cpp - factorization/par_ilu.cpp - log/convergence.cpp - log/logger.cpp - log/record.cpp - log/stream.cpp - matrix/coo.cpp - matrix/csr.cpp - matrix/dense.cpp - matrix/ell.cpp - matrix/hybrid.cpp - matrix/identity.cpp - matrix/sellp.cpp - matrix/sparsity_csr.cpp - preconditioner/jacobi.cpp - solver/bicgstab.cpp - solver/cg.cpp - solver/cgs.cpp - solver/fcg.cpp - solver/gmres.cpp - solver/ir.cpp - solver/lower_trs.cpp - solver/upper_trs.cpp - stop/combined.cpp - stop/criterion.cpp - stop/iteration.cpp - stop/residual_norm_reduction.cpp - stop/time.cpp) + base/array.cpp + base/combination.cpp + base/composition.cpp + base/executor.cpp + base/mtx_io.cpp + base/perturbation.cpp + base/version.cpp + factorization/ilu.cpp + factorization/par_ict.cpp + factorization/par_ilu.cpp + factorization/par_ilut.cpp + log/convergence.cpp + log/logger.cpp + log/record.cpp + log/stream.cpp + matrix/coo.cpp + matrix/csr.cpp + matrix/dense.cpp + matrix/ell.cpp + matrix/hybrid.cpp + matrix/identity.cpp + matrix/permutation.cpp + matrix/sellp.cpp + matrix/sparsity_csr.cpp + preconditioner/isai.cpp + preconditioner/jacobi.cpp + solver/bicg.cpp + solver/bicgstab.cpp + solver/cg.cpp + solver/cgs.cpp + solver/fcg.cpp + solver/gmres.cpp + solver/ir.cpp + solver/lower_trs.cpp + solver/upper_trs.cpp + stop/combined.cpp + stop/criterion.cpp + stop/iteration.cpp + stop/residual_norm.cpp + stop/time.cpp) if(GINKGO_HAVE_PAPI_SDE) target_sources(ginkgo PRIVATE log/papi.cpp) @@ -49,14 +56,18 @@ target_compile_options(ginkgo PRIVATE "${GINKGO_COMPILER_FLAGS}") # regardless of whether it is installed or added as a subdirectory add_library(Ginkgo::ginkgo ALIAS ginkgo) target_link_libraries(ginkgo - PUBLIC ginkgo_omp ginkgo_cuda ginkgo_reference) + PUBLIC ginkgo_omp 
ginkgo_cuda ginkgo_reference ginkgo_hip) +# The PAPI dependency needs to be exposed to the user. if (GINKGO_HAVE_PAPI_SDE) - target_link_libraries(ginkgo PRIVATE PAPI::PAPI) + target_link_libraries(ginkgo PUBLIC PAPI::PAPI) endif() ginkgo_default_includes(ginkgo) ginkgo_install_library(ginkgo core) +if (GINKGO_CHECK_CIRCULAR_DEPS) + ginkgo_check_headers(ginkgo) +endif() + if(GINKGO_BUILD_TESTS) add_subdirectory(test) endif() - diff --git a/core/base/allocator.hpp b/core/base/allocator.hpp new file mode 100644 index 00000000000..0c62f5deccb --- /dev/null +++ b/core/base/allocator.hpp @@ -0,0 +1,175 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_BASE_ALLOCATOR_HPP_ +#define GKO_CORE_BASE_ALLOCATOR_HPP_ + + +#include +#include +#include +#include +#include +#include +#include + + +#include + + +namespace gko { + +/** + * @internal + * + * C++ standard library-compatible allocator that uses an executor for + * allocations. + * + * @tparam T the type of the allocated elements. + */ +template +class ExecutorAllocator { +public: + using value_type = T; + using propagate_on_container_copy_assignment = std::true_type; + using propagate_on_container_move_assignment = std::true_type; + using propagate_on_container_swap = std::true_type; + + /** + * Constructs an allocator from a given executor. + * + * This function works with both const and non-const ExecType, + * as long as it is derived from gko::Executor. + * @param exec the executor + * @tparam ExecType the static type of the executor + */ + template + ExecutorAllocator(std::shared_ptr exec) : exec_{std::move(exec)} + {} + + /** + * Constructs an allocator for another element type from a given executor. + * + * This is related to `std::allocator_traits::template rebind` and its + * use in more advanced data structures. + * + * @param other the other executor + * @tparam U the element type of the allocator to be constructed. 
+ */ + template + explicit ExecutorAllocator(const ExecutorAllocator &other) + : exec_{other.get_executor()} + {} + + /** Returns the executor used by this allocator. */ + std::shared_ptr get_executor() const { return exec_; } + + /** + * Allocates a memory area of the given size. + * + * @param n the number of elements to allocate + * @return the pointer to a newly allocated memory area of `n` elements. + */ + T *allocate(std::size_t n) const { return exec_->alloc(n); } + + /** + * Frees a memory area that was allocated by this allocator. + * + * @param ptr The memory area to free, previously returned by `allocate`. + * + * @note The second parameter is unused. + */ + void deallocate(T *ptr, std::size_t) const { exec_->free(ptr); } + + /** + * Compares two ExecutorAllocators for equality + * + * @param l the first allocator + * @param r the second allocator + * @return true iff the two allocators use the same executor + */ + template + friend bool operator==(const ExecutorAllocator &l, + const ExecutorAllocator &r) + { + return l.get_executor() == r.get_executor(); + } + + /** + * Compares two ExecutorAllocators for inequality + * + * @param l the first allocator + * @param r the second allocator + * @return true iff the two allocators use different executors + */ + template + friend bool operator!=(const ExecutorAllocator &l, + const ExecutorAllocator &r) + { + return !(l == r); + } + +private: + std::shared_ptr exec_; +}; + + +// Convenience type aliases +/** std::vector using an ExecutorAllocator. */ +template +using vector = std::vector>; + +/** std::set using an ExecutorAllocator. */ +template +using set = std::set, gko::ExecutorAllocator>; + +/** std::map using an ExecutorAllocator. */ +template +using map = std::map, + gko::ExecutorAllocator>>; + +/** std::unordered_set using an ExecutorAllocator. */ +template +using unordered_set = + std::unordered_set, std::equal_to, + gko::ExecutorAllocator>; + +/** std::unordered_map using an ExecutorAllocator. 
*/ +template +using unordered_map = + std::unordered_map, std::equal_to, + gko::ExecutorAllocator>>; + + +} // namespace gko + +#endif // GKO_CORE_BASE_ALLOCATOR_HPP_ \ No newline at end of file diff --git a/core/base/array.cpp b/core/base/array.cpp new file mode 100644 index 00000000000..21d8b5f3326 --- /dev/null +++ b/core/base/array.cpp @@ -0,0 +1,71 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include "core/components/precision_conversion.hpp" + + +namespace gko { +namespace conversion { + + +GKO_REGISTER_OPERATION(convert, components::convert_precision); + + +} // namespace conversion + + +namespace detail { + + +template +void convert_data(std::shared_ptr exec, size_type size, + const SourceType *src, TargetType *dst) +{ + exec->run(conversion::make_convert(size, src, dst)); +} + + +#define GKO_DECLARE_ARRAY_CONVERSION(From, To) \ + void convert_data(std::shared_ptr, size_type, \ + const From *, To *) + +GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(GKO_DECLARE_ARRAY_CONVERSION); + + +} // namespace detail +} // namespace gko diff --git a/core/base/combination.cpp b/core/base/combination.cpp index 567d8d9778b..dd95298858e 100644 --- a/core/base/combination.cpp +++ b/core/base/combination.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -59,6 +59,45 @@ inline void initialize_scalars(std::shared_ptr exec, } // namespace +template +std::unique_ptr Combination::transpose() const +{ + auto transposed = Combination::create(this->get_executor()); + transposed->set_size(gko::transpose(this->get_size())); + // copy coefficients + for (auto &coef : get_coefficients()) { + transposed->coefficients_.push_back(share(coef->clone())); + } + // transpose operators + for (auto &op : get_operators()) { + transposed->operators_.push_back( + share(as(op)->transpose())); + } + + return std::move(transposed); +} + + +template +std::unique_ptr Combination::conj_transpose() const +{ + auto transposed = Combination::create(this->get_executor()); + transposed->set_size(gko::transpose(this->get_size())); + // conjugate coefficients! 
+ for (auto &coef : get_coefficients()) { + transposed->coefficients_.push_back( + share(as(coef)->conj_transpose())); + } + // conjugate-transpose operators + for (auto &op : get_operators()) { + transposed->operators_.push_back( + share(as(op)->conj_transpose())); + } + + return std::move(transposed); +} + + template void Combination::apply_impl(const LinOp *b, LinOp *x) const { diff --git a/core/base/composition.cpp b/core/base/composition.cpp index ea15b5087c4..6fb0171b56e 100644 --- a/core/base/composition.cpp +++ b/core/base/composition.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,50 +33,143 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include +#include + + #include +#include "core/components/fill_array.hpp" + + namespace gko { -namespace { +namespace composition { + + +GKO_REGISTER_OPERATION(fill_array, components::fill_array); -template -inline void allocate_vectors(OpIterator begin, OpIterator end, VecIterator res) +} // namespace composition + + +template +std::unique_ptr apply_inner_operators( + const std::vector> &operators, + Array &storage, const LinOp *rhs) { - for (auto it = begin; it != end; ++it, ++res) { - if (*res != nullptr && (*res)->get_size()[0] == (*it)->get_size()[0]) { - continue; + using Dense = matrix::Dense; + // determine amount of necessary storage: + // maximum sum of two subsequent intermediate vectors + // (and the out dimension of the last op if we only have one operator) + auto num_rhs = rhs->get_size()[1]; + auto max_intermediate_size = std::accumulate( + begin(operators) + 1, end(operators) - 1, + operators.back()->get_size()[0], + [](size_type acc, std::shared_ptr op) { + return std::max(acc, op->get_size()[0] + op->get_size()[1]); + }); + auto storage_size = 
max_intermediate_size * num_rhs; + storage.resize_and_reset(storage_size); + + // apply inner vectors + auto exec = rhs->get_executor(); + auto data = storage.get_data(); + // apply last operator + auto op_size = operators.back()->get_size(); + auto out_dim = gko::dim<2>{op_size[0], num_rhs}; + auto out_size = out_dim[0] * num_rhs; + auto out = Dense::create( + exec, out_dim, Array::view(exec, out_size, data), num_rhs); + // for operators with initial guess: set initial guess + if (operators.back()->apply_uses_initial_guess()) { + if (op_size[0] == op_size[1]) { + // square matrix: we can use the previous output + exec->copy(out_size, as(rhs)->get_const_values(), + out->get_values()); + } else { + // rectangular matrix: we can't do better than zeros + exec->run(composition::make_fill_array(out->get_values(), out_size, + zero())); } - *res = matrix::Dense::create( - (*it)->get_executor(), gko::dim<2>{(*it)->get_size()[0], 1}); } + operators.back()->apply(rhs, lend(out)); + // apply following operators + // alternate intermediate vectors between beginning/end of storage + auto reversed_storage = true; + for (auto i = operators.size() - 2; i > 0; --i) { + // swap in and out + auto in = std::move(out); + // build new intermediate vector + op_size = operators[i]->get_size(); + out_dim[0] = op_size[0]; + out_size = out_dim[0] * num_rhs; + auto out_data = + data + (reversed_storage ? 
storage_size - out_size : size_type{}); + reversed_storage = !reversed_storage; + out = Dense::create(exec, out_dim, + Array::view(exec, out_size, out_data), + num_rhs); + // for operators with initial guess: set initial guess + if (operators[i]->apply_uses_initial_guess()) { + if (op_size[0] == op_size[1]) { + // square matrix: we can use the previous output + exec->copy(out_size, in->get_const_values(), out->get_values()); + } else { + // rectangular matrix: we can't do better than zeros + exec->run(composition::make_fill_array( + out->get_values(), out_size, zero())); + } + } + // apply operator + operators[i]->apply(lend(in), lend(out)); + } + + return std::move(out); } -inline const LinOp *apply_inner_operators( - const std::vector> &operators, - const std::vector> &intermediate, const LinOp *rhs) +template +std::unique_ptr Composition::transpose() const { - for (auto i = operators.size() - 1; i > 0u; --i) { - auto solution = lend(intermediate[i - 1]); - operators[i]->apply(rhs, solution); - rhs = solution; - } - return rhs; + auto transposed = Composition::create(this->get_executor()); + transposed->set_size(gko::transpose(this->get_size())); + // transpose and reverse operators + std::transform(this->get_operators().rbegin(), this->get_operators().rend(), + std::back_inserter(transposed->operators_), + [](const std::shared_ptr &op) { + return share(as(op)->transpose()); + }); + + return std::move(transposed); } -} // namespace +template +std::unique_ptr Composition::conj_transpose() const +{ + auto transposed = Composition::create(this->get_executor()); + transposed->set_size(gko::transpose(this->get_size())); + // conjugate-transpose and reverse operators + std::transform(this->get_operators().rbegin(), this->get_operators().rend(), + std::back_inserter(transposed->operators_), + [](const std::shared_ptr &op) { + return share(as(op)->conj_transpose()); + }); + + return std::move(transposed); +} template void Composition::apply_impl(const LinOp *b, LinOp *x) 
const { - cache_.intermediate.resize(operators_.size() - 1); - allocate_vectors(begin(operators_) + 1, end(operators_), - begin(cache_.intermediate)); - operators_[0]->apply( - apply_inner_operators(operators_, cache_.intermediate, b), x); + if (operators_.size() > 1) { + operators_[0]->apply( + lend(apply_inner_operators(operators_, storage_, b)), x); + } else { + operators_[0]->apply(b, x); + } } @@ -84,12 +177,13 @@ template void Composition::apply_impl(const LinOp *alpha, const LinOp *b, const LinOp *beta, LinOp *x) const { - cache_.intermediate.resize(operators_.size() - 1); - allocate_vectors(begin(operators_) + 1, end(operators_), - begin(cache_.intermediate)); - operators_[0]->apply( - alpha, apply_inner_operators(operators_, cache_.intermediate, b), beta, - x); + if (operators_.size() > 1) { + operators_[0]->apply( + alpha, lend(apply_inner_operators(operators_, storage_, b)), beta, + x); + } else { + operators_[0]->apply(alpha, b, beta, x); + } } diff --git a/core/base/executor.cpp b/core/base/executor.cpp index 4c2d6828ee6..9d80ad818f0 100644 --- a/core/base/executor.cpp +++ b/core/base/executor.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -49,6 +49,10 @@ void Operation::run(std::shared_ptr executor) const GKO_NOT_IMPLEMENTED; +void Operation::run(std::shared_ptr executor) const + GKO_NOT_IMPLEMENTED; + + void Operation::run(std::shared_ptr executor) const { this->run(static_cast>(executor)); diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index 84be68eca66..1d35b09f8e7 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -44,6 +44,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#elif defined(__HIP_DEVICE_COMPILE__) + + +#include + + #endif // __CUDA_ARCH__ @@ -301,16 +307,16 @@ struct precision_converter { */ class half { public: - GKO_ATTRIBUTES half() noexcept = default; + half() noexcept = default; GKO_ATTRIBUTES half(float32 val) noexcept { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) const auto tmp = __float2half_rn(val); data_ = reinterpret_cast(tmp); -#else // __CUDA_ARCH__ +#else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) data_ = float2half(reinterpret_cast(val)); -#endif // __CUDA_ARCH__ +#endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) } GKO_ATTRIBUTES half(float64 val) noexcept : half(static_cast(val)) @@ -318,12 +324,12 @@ class half { GKO_ATTRIBUTES operator float32() const noexcept { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) return __half2float(reinterpret_cast(data_)); -#else // __CUDA_ARCH__ +#else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) const auto bits = half2float(data_); return reinterpret_cast(bits); -#endif // __CUDA_ARCH__ +#endif 
// defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) } GKO_ATTRIBUTES operator float64() const noexcept @@ -331,6 +337,14 @@ class half { return static_cast(static_cast(*this)); } + GKO_ATTRIBUTES half operator-() const noexcept + { + auto res = *this; + // flip sign bit + res.data_ ^= f16_traits::sign_mask; + return res; + } + private: using f16_traits = detail::float_traits; using f32_traits = detail::float_traits; @@ -434,7 +448,7 @@ class truncated { static_assert(component_id < num_components, "This type doesn't have that many components"); - GKO_ATTRIBUTES truncated() noexcept = default; + truncated() noexcept = default; GKO_ATTRIBUTES explicit truncated(const float_type &val) noexcept { @@ -450,6 +464,16 @@ class truncated { return reinterpret_cast(bits); } + GKO_ATTRIBUTES truncated operator-() const noexcept + { + auto res = *this; + // flip sign bit + if (ComponentId == 0) { + res.data_ ^= bits_type{1} << (8 * sizeof(bits_type) - 1); + } + return res; + } + private: bits_type data_; }; diff --git a/core/base/iterator_factory.hpp b/core/base/iterator_factory.hpp index a9d03603f4c..b7efd21dfe0 100644 --- a/core/base/iterator_factory.hpp +++ b/core/base/iterator_factory.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -292,7 +292,7 @@ class IteratorFactory { Reference operator*() const { return {parent_, arr_index_}; } - Reference operator[](size_t idx) const + Reference operator[](difference_type idx) const { return {parent_, arr_index_ + idx}; } diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp index 26995be0b4d..ab2b96cce29 100644 --- a/core/base/mtx_io.cpp +++ b/core/base/mtx_io.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -139,6 +139,7 @@ class mtx_io { struct : entry_format { /** * reads entry from the input stream + * * @param is the input stream * * @return the matrix entry. @@ -152,6 +153,7 @@ class mtx_io { /** * writes entry to the output stream + * * @param os the output stream * @param value the matrix entry to be written */ @@ -186,6 +188,7 @@ class mtx_io { struct : entry_format { /** * reads entry from the input stream + * * @param is the input stream * * @return the matrix entry. @@ -197,6 +200,7 @@ class mtx_io { /** * writes entry to the output stream + * * @param os the output stream * @param value the matrix entry to be written */ @@ -237,6 +241,7 @@ class mtx_io { struct : entry_format { /** * reads entry from the input stream + * * @param dummy input stream * * @return the matrix entry(one). 
@@ -248,6 +253,7 @@ class mtx_io { /** * writes entry to the output stream + * * @param dummy output stream * @param dummy matrix entry to be written */ @@ -284,6 +290,7 @@ class mtx_io { struct : storage_modifier { /** * get the reservation size + * * @param num_rows the number of rows * @param num_cols the number of columns * @param num_nonzeros the number of non-zeros @@ -298,6 +305,7 @@ class mtx_io { /** * Insert an entry + * * @param row The row where the entry is to be inserted. * @param col The column where the entry is to be inserted. * @param entry the entry to be inserted. @@ -337,6 +345,7 @@ class mtx_io { /** * Insert an entry + * * @param row The row where the entry is to be inserted. * @param col The column where the entry is to be inserted. * @param entry the entry to be inserted. @@ -366,6 +375,7 @@ class mtx_io { struct : storage_modifier { /** * get the reservation size + * * @param num_rows * @param num_cols * @param num_nonzeros the number of non-zeros @@ -380,6 +390,7 @@ class mtx_io { /** * Insert an entry + * * @param row The row where the entry is to be inserted. * @param col The column where the entry is to be inserted. * @param entry the entry to be inserted. @@ -409,6 +420,7 @@ class mtx_io { struct : storage_modifier { /** * get the reservation size + * * @param num_rows * @param num_cols * @param num_nonzeros the number of non-zeros @@ -423,6 +435,7 @@ class mtx_io { /** * Insert an entry + * * @param row The row where the entry is to be inserted. * @param col The column where the entry is to be inserted. * @param entry the entry to be inserted. @@ -667,6 +680,7 @@ class mtx_io { /** * reads and parses the first line of the header + * * @param is the input stream * * @return the data containing the description @@ -711,6 +725,7 @@ class mtx_io { /** * reads and parses the header + * * @param is The input stream to read the header from. 
* * @return the header data diff --git a/core/base/perturbation.cpp b/core/base/perturbation.cpp index f2cbaeb6587..a7a6a0b004b 100644 --- a/core/base/perturbation.cpp +++ b/core/base/perturbation.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/core/base/utils.hpp b/core/base/utils.hpp new file mode 100644 index 00000000000..4e6fbc1dfce --- /dev/null +++ b/core/base/utils.hpp @@ -0,0 +1,56 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_INTERNAL_CORE_BASE_UTILS_HPP_ +#define GKO_INTERNAL_CORE_BASE_UTILS_HPP_ + +#include + + +namespace gko { +namespace kernels { + + +template +GKO_ATTRIBUTES GKO_INLINE ValueType checked_load(const ValueType *p, + IndexType i, IndexType size, + ValueType sentinel) +{ + return i < size ? p[i] : sentinel; +} + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_INTERNAL_CORE_BASE_UTILS_HPP_ \ No newline at end of file diff --git a/core/base/version.cpp b/core/base/version.cpp index 16846760594..7993cee5cae 100644 --- a/core/base/version.cpp +++ b/core/base/version.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -64,6 +64,8 @@ std::ostream &operator<<(std::ostream &os, const version_info &ver_info) print_version(os, ver_info.omp_version); os << "\n the CUDA module is "; print_version(os, ver_info.cuda_version); + os << "\n the HIP module is "; + print_version(os, ver_info.hip_version); return os; } diff --git a/core/components/fill_array.hpp b/core/components/fill_array.hpp new file mode 100644 index 00000000000..7bafb8aecb4 --- /dev/null +++ b/core/components/fill_array.hpp @@ -0,0 +1,101 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_COMPONENTS_FILL_ARRAY_HPP_ +#define GKO_CORE_COMPONENTS_FILL_ARRAY_HPP_ + + +#include + + +#include +#include + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_FILL_ARRAY_KERNEL(ValueType) \ + void fill_array(std::shared_ptr exec, \ + ValueType *data, size_type num_entries, ValueType val) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_FILL_ARRAY_KERNEL(IndexType) + + +namespace omp { +namespace components { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace components +} // namespace omp + + +namespace cuda { +namespace components { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace components +} // namespace cuda + + +namespace reference { +namespace components { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace components +} // namespace reference + + +namespace hip { +namespace components { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace components +} // namespace hip + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_COMPONENTS_FILL_ARRAY_HPP_ diff --git a/core/components/precision_conversion.hpp b/core/components/precision_conversion.hpp new file mode 100644 index 00000000000..719c596c34e --- /dev/null +++ b/core/components/precision_conversion.hpp @@ -0,0 +1,102 @@ +/************************************************************* +Copyright (c) 2017-2020, the 
Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_CORE_COMPONENTS_PRECISION_CONVERSION_HPP_ +#define GKO_CORE_COMPONENTS_PRECISION_CONVERSION_HPP_ + + +#include + + +#include +#include +#include + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_CONVERT_PRECISION_KERNEL(SourceType, TargetType) \ + void convert_precision(std::shared_ptr exec, \ + size_type size, const SourceType *in, \ + TargetType *out) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_CONVERT_PRECISION_KERNEL(SourceType, TargetType) + + +namespace omp { +namespace components { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace components +} // namespace omp + + +namespace cuda { +namespace components { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace components +} // namespace cuda + + +namespace reference { +namespace components { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace components +} // namespace reference + + +namespace hip { +namespace components { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace components +} // namespace hip + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + +#endif // GKO_CORE_COMPONENTS_PRECISION_CONVERSION_HPP_ diff --git a/core/components/prefix_sum.hpp b/core/components/prefix_sum.hpp new file mode 100644 index 00000000000..d171be831aa --- /dev/null +++ b/core/components/prefix_sum.hpp @@ -0,0 +1,100 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_CORE_COMPONENTS_PREFIX_SUM_HPP_ +#define GKO_CORE_COMPONENTS_PREFIX_SUM_HPP_ + + +#include + + +#include +#include + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_PREFIX_SUM_KERNEL(IndexType) \ + void prefix_sum(std::shared_ptr exec, \ + IndexType *counts, size_type num_entries) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_PREFIX_SUM_KERNEL(IndexType) + + +namespace omp { +namespace components { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace components +} // namespace omp + + +namespace cuda { +namespace components { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace components +} // namespace cuda + + +namespace reference { +namespace components { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace components +} // namespace reference + + +namespace hip { +namespace components { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace components +} // namespace hip + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + +#endif // GKO_CORE_COMPONENTS_PREFIX_SUM_HPP_ diff --git a/core/device_hooks/CMakeLists.txt b/core/device_hooks/CMakeLists.txt index 4ca355a3d51..94dfc8ab9f0 100644 --- a/core/device_hooks/CMakeLists.txt +++ b/core/device_hooks/CMakeLists.txt @@ -3,6 +3,7 @@ if(NOT GINKGO_BUILD_CUDA) $ cuda_hooks.cpp) ginkgo_compile_features(ginkgo_cuda) + target_link_libraries(ginkgo_cuda PUBLIC ginkgo_hip) ginkgo_default_includes(ginkgo_cuda) ginkgo_install_library(ginkgo_cuda cuda) endif() @@ -13,6 +14,7 @@ if (NOT GINKGO_BUILD_OMP) omp_hooks.cpp) ginkgo_compile_features(ginkgo_omp) target_link_libraries(ginkgo_omp PUBLIC ginkgo_cuda) + target_link_libraries(ginkgo_omp PUBLIC ginkgo_hip) ginkgo_default_includes(ginkgo_omp) ginkgo_install_library(ginkgo_omp omp) endif() @@ -25,3 +27,12 @@ if (NOT GINKGO_BUILD_REFERENCE) ginkgo_default_includes(ginkgo_reference) ginkgo_install_library(ginkgo_reference 
reference) endif() + +if(NOT GINKGO_BUILD_HIP) + add_library(ginkgo_hip + $ + hip_hooks.cpp) + ginkgo_compile_features(ginkgo_hip) + ginkgo_default_includes(ginkgo_hip) + ginkgo_install_library(ginkgo_hip hip) +endif() diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 05a7f8bc136..53798c2b596 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,7 +33,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/components/fill_array.hpp" +#include "core/components/precision_conversion.hpp" +#include "core/components/prefix_sum.hpp" +#include "core/factorization/factorization_kernels.hpp" +#include "core/factorization/ilu_kernels.hpp" +#include "core/factorization/par_ict_kernels.hpp" #include "core/factorization/par_ilu_kernels.hpp" +#include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/coo_kernels.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/matrix/dense_kernels.hpp" @@ -41,7 +48,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/hybrid_kernels.hpp" #include "core/matrix/sellp_kernels.hpp" #include "core/matrix/sparsity_csr_kernels.hpp" +#include "core/preconditioner/isai_kernels.hpp" #include "core/preconditioner/jacobi_kernels.hpp" +#include "core/solver/bicg_kernels.hpp" #include "core/solver/bicgstab_kernels.hpp" #include "core/solver/cg_kernels.hpp" #include "core/solver/cgs_kernels.hpp" @@ -51,7 +60,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/solver/lower_trs_kernels.hpp" #include "core/solver/upper_trs_kernels.hpp" #include "core/stop/criterion_kernels.hpp" -#include "core/stop/residual_norm_reduction_kernels.hpp" +#include "core/stop/residual_norm_kernels.hpp" #ifndef GKO_HOOK_MODULE @@ -62,6 +71,32 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { namespace kernels { namespace GKO_HOOK_MODULE { +namespace components { + + +template +GKO_DECLARE_CONVERT_PRECISION_KERNEL(SourceType, TargetType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(GKO_DECLARE_CONVERT_PRECISION_KERNEL); + +template +GKO_DECLARE_PREFIX_SUM_KERNEL(IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL); +// explicitly instantiate for size_type, as this is used in the SellP format +template GKO_DECLARE_PREFIX_SUM_KERNEL(size_type); + +template +GKO_DECLARE_FILL_ARRAY_KERNEL(IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL); +template GKO_DECLARE_FILL_ARRAY_KERNEL(size_type); + + +} // namespace components + + namespace dense { @@ -164,6 +199,29 @@ GKO_DECLARE_CONJ_TRANSPOSE_KERNEL(ValueType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CONJ_TRANSPOSE_KERNEL); +template +GKO_DECLARE_ROW_PERMUTE_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ROW_PERMUTE_KERNEL); + +template +GKO_DECLARE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_COLUMN_PERMUTE_KERNEL); + +template +GKO_DECLARE_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_INVERSE_ROW_PERMUTE_KERNEL); 
+ +template +GKO_DECLARE_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_INVERSE_COLUMN_PERMUTE_KERNEL); + } // namespace dense @@ -190,6 +248,28 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_STEP_2_KERNEL); } // namespace cg +namespace bicg { + + +template +GKO_DECLARE_BICG_INITIALIZE_KERNEL(ValueType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_INITIALIZE_KERNEL); + +template +GKO_DECLARE_BICG_STEP_1_KERNEL(ValueType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_1_KERNEL); + +template +GKO_DECLARE_BICG_STEP_2_KERNEL(ValueType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_2_KERNEL); + + +} // namespace bicg + + namespace lower_trs { @@ -422,6 +502,22 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL); +template +GKO_DECLARE_CSR_SPGEMM_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL); + +template +GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL); + +template +GKO_DECLARE_CSR_SPGEAM_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); + template GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); @@ -469,6 +565,30 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL); +template +GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL); + +template +GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL); + +template +GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL); + +template +GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL); + template GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); @@ -667,6 +787,18 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL); +template +GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL); + +template +GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL); + template GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); @@ -680,20 +812,102 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE); } // namespace jacobi -namespace par_ilu_factorization { +namespace isai { template -GKO_DECLARE_PAR_ILU_INITIALIZE_ROW_PTRS_L_U_KERNEL(ValueType, IndexType) +GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILU_INITIALIZE_ROW_PTRS_L_U_KERNEL); + GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL); template 
-GKO_DECLARE_PAR_ILU_INITIALIZE_L_U_KERNEL(ValueType, IndexType) +GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILU_INITIALIZE_L_U_KERNEL); + GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL); + +template +GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL); + + +} // namespace isai + + +namespace factorization { + + +template +GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL); + +template +GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL); + +template +GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL); + +template +GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL); + +template +GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL); + + +} // namespace factorization + + +namespace ilu_factorization { + + +template +GKO_DECLARE_ILU_COMPUTE_LU_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ILU_COMPUTE_LU_KERNEL); + + +} // 
namespace ilu_factorization + + +namespace par_ict_factorization { + + +template +GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL); + +template +GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL); + + +} // namespace par_ict_factorization + + +namespace par_ilu_factorization { + template GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL(ValueType, IndexType) @@ -705,6 +919,43 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace par_ilu_factorization +namespace par_ilut_factorization { + + +template +GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL); + +template +GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL); + +template +GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL); + +template +GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL); + +template +GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL(ValueType, IndexType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL); + + +} // namespace par_ilut_factorization + + namespace set_all_statuses { @@ -715,16 +966,17 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE); 
} // namespace set_all_statuses -namespace residual_norm_reduction { +namespace residual_norm { template -GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL(ValueType) +GKO_DECLARE_RESIDUAL_NORM_KERNEL(ValueType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE( + GKO_DECLARE_RESIDUAL_NORM_KERNEL); -} // namespace residual_norm_reduction +} // namespace residual_norm } // namespace GKO_HOOK_MODULE } // namespace kernels } // namespace gko diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp index 884b85425b3..d41d77d24d9 100644 --- a/core/device_hooks/cuda_hooks.cpp +++ b/core/device_hooks/cuda_hooks.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,8 +30,13 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ +#include +#include + + #include #include +#include #include @@ -50,10 +55,10 @@ version version_info::get_cuda_version() noexcept std::shared_ptr CudaExecutor::create( - int device_id, std::shared_ptr master) + int device_id, std::shared_ptr master, bool device_reset) { return std::shared_ptr( - new CudaExecutor(device_id, std::move(master))); + new CudaExecutor(device_id, std::move(master), device_reset)); } @@ -70,8 +75,7 @@ void CudaExecutor::raw_free(void *ptr) const noexcept } -void *CudaExecutor::raw_alloc(size_type num_bytes) const - GKO_NOT_COMPILED(nvidia); +void *CudaExecutor::raw_alloc(size_type num_bytes) const GKO_NOT_COMPILED(cuda); void CudaExecutor::raw_copy_to(const OmpExecutor *, size_type num_bytes, @@ -84,6 +88,11 @@ void CudaExecutor::raw_copy_to(const CudaExecutor *, size_type num_bytes, GKO_NOT_COMPILED(cuda); +void CudaExecutor::raw_copy_to(const HipExecutor *, size_type num_bytes, + const void *src_ptr, void *dest_ptr) const + GKO_NOT_COMPILED(cuda); + + void CudaExecutor::synchronize() const GKO_NOT_COMPILED(cuda); diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp new file mode 100644 index 00000000000..a2e288b4157 --- /dev/null +++ b/core/device_hooks/hip_hooks.cpp @@ -0,0 +1,135 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include +#include + + +#include +#include +#include +#include + + +namespace gko { + + +version version_info::get_hip_version() noexcept +{ + // We just return 1.1.0 with a special "not compiled" tag in placeholder + // modules. + return {1, 1, 0, "not compiled"}; +} + + +std::shared_ptr HipExecutor::create( + int device_id, std::shared_ptr master, bool device_reset) +{ + return std::shared_ptr( + new HipExecutor(device_id, std::move(master), device_reset)); +} + + +void OmpExecutor::raw_copy_to(const HipExecutor *, size_type num_bytes, + const void *src_ptr, void *dest_ptr) const + GKO_NOT_COMPILED(hip); + + +void HipExecutor::raw_free(void *ptr) const noexcept +{ + // Free must never fail, as it can be called in destructors. + // If the HIP module was not compiled, the library couldn't have + // allocated the memory, so there is no need to deallocate it. 
+} + + +void *HipExecutor::raw_alloc(size_type num_bytes) const GKO_NOT_COMPILED(hip); + + +void HipExecutor::raw_copy_to(const OmpExecutor *, size_type num_bytes, + const void *src_ptr, void *dest_ptr) const + GKO_NOT_COMPILED(hip); + + +void HipExecutor::raw_copy_to(const CudaExecutor *, size_type num_bytes, + const void *src_ptr, void *dest_ptr) const + GKO_NOT_COMPILED(hip); + + +void HipExecutor::raw_copy_to(const HipExecutor *, size_type num_bytes, + const void *src_ptr, void *dest_ptr) const + GKO_NOT_COMPILED(hip); + + +void HipExecutor::synchronize() const GKO_NOT_COMPILED(hip); + + +void HipExecutor::run(const Operation &op) const +{ + op.run( + std::static_pointer_cast(this->shared_from_this())); +} + + +std::string HipError::get_error(int64) +{ + return "ginkgo HIP module is not compiled"; +} + + +std::string HipblasError::get_error(int64) +{ + return "ginkgo HIP module is not compiled"; +} + + +std::string HipsparseError::get_error(int64) +{ + return "ginkgo HIP module is not compiled"; +} + + +int HipExecutor::get_num_devices() { return 0; } + + +void HipExecutor::set_gpu_property() {} + + +void HipExecutor::init_handles() {} + + +} // namespace gko + + +#define GKO_HOOK_MODULE hip +#include "core/device_hooks/common_kernels.inc.cpp" +#undef GKO_HOOK_MODULE diff --git a/core/device_hooks/omp_hooks.cpp b/core/device_hooks/omp_hooks.cpp index 4fb251758a8..131fa51a4d8 100644 --- a/core/device_hooks/omp_hooks.cpp +++ b/core/device_hooks/omp_hooks.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/core/device_hooks/reference_hooks.cpp b/core/device_hooks/reference_hooks.cpp index 7e7ab287ca5..ea7742776c8 100644 --- a/core/device_hooks/reference_hooks.cpp +++ b/core/device_hooks/reference_hooks.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ - #include diff --git a/core/devices/CMakeLists.txt b/core/devices/CMakeLists.txt index 67e8a6fab58..2a5626c0018 100644 --- a/core/devices/CMakeLists.txt +++ b/core/devices/CMakeLists.txt @@ -8,4 +8,5 @@ endfunction() add_subdirectory(omp) add_subdirectory(cuda) +add_subdirectory(hip) add_subdirectory(reference) diff --git a/core/devices/cuda/executor.cpp b/core/devices/cuda/executor.cpp index b377b2afa94..3566578a681 100644 --- a/core/devices/cuda/executor.cpp +++ b/core/devices/cuda/executor.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/core/devices/hip/CMakeLists.txt b/core/devices/hip/CMakeLists.txt new file mode 100644 index 00000000000..7f855b3e2e9 --- /dev/null +++ b/core/devices/hip/CMakeLists.txt @@ -0,0 +1,3 @@ +ginkgo_add_object_library(ginkgo_hip_device + executor.cpp) + diff --git a/core/devices/hip/executor.cpp b/core/devices/hip/executor.cpp new file mode 100644 index 00000000000..f4787523290 --- /dev/null +++ b/core/devices/hip/executor.cpp @@ -0,0 +1,54 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +namespace gko { + + +std::shared_ptr HipExecutor::get_master() noexcept { return master_; } + + +std::shared_ptr HipExecutor::get_master() const noexcept +{ + return master_; +} + + +int HipExecutor::num_execs[max_devices]; + + +std::mutex HipExecutor::mutex[max_devices]; + + +} // namespace gko diff --git a/core/devices/omp/executor.cpp b/core/devices/omp/executor.cpp index 193672ef229..e53a1b53c43 100644 --- a/core/devices/omp/executor.cpp +++ b/core/devices/omp/executor.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -68,7 +68,9 @@ void *OmpExecutor::raw_alloc(size_type num_bytes) const void OmpExecutor::raw_copy_to(const OmpExecutor *, size_type num_bytes, const void *src_ptr, void *dest_ptr) const { - std::memcpy(dest_ptr, src_ptr, num_bytes); + if (num_bytes > 0) { + std::memcpy(dest_ptr, src_ptr, num_bytes); + } } diff --git a/core/devices/reference/dummy.cpp b/core/devices/reference/dummy.cpp index 971afc395e3..a2f3f380cbe 100644 --- a/core/devices/reference/dummy.cpp +++ b/core/devices/reference/dummy.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/core/factorization/factorization_kernels.hpp b/core/factorization/factorization_kernels.hpp new file mode 100644 index 00000000000..f7c25964dde --- /dev/null +++ b/core/factorization/factorization_kernels.hpp @@ -0,0 +1,142 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_FACTORIZATION_FACTORIZATION_KERNELS_HPP_ +#define GKO_CORE_FACTORIZATION_FACTORIZATION_KERNELS_HPP_ + + +#include + + +#include +#include +#include + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL(ValueType, \ + IndexType) \ + void add_diagonal_elements(std::shared_ptr exec, \ + matrix::Csr *mtx, \ + bool is_sorted) + +#define GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL(ValueType, \ + IndexType) \ + void initialize_row_ptrs_l_u( \ + std::shared_ptr exec, \ + const matrix::Csr *system_matrix, \ + IndexType *l_row_ptrs, IndexType *u_row_ptrs) + +#define GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL(ValueType, IndexType) \ + void initialize_l_u( \ + std::shared_ptr exec, \ + const matrix::Csr *system_matrix, \ + matrix::Csr *l_factor, \ + matrix::Csr *u_factor) + +#define GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL(ValueType, \ + IndexType) \ + void initialize_row_ptrs_l( \ + std::shared_ptr exec, \ + const matrix::Csr *system_matrix, \ + IndexType *l_row_ptrs) + +#define GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL(ValueType, IndexType) \ + 
void initialize_l(std::shared_ptr exec, \ + const matrix::Csr *system_matrix, \ + matrix::Csr *l_factor, \ + bool diag_sqrt) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL(ValueType, \ + IndexType); \ + template \ + GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL(ValueType, \ + IndexType); \ + template \ + GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL(ValueType, \ + IndexType); \ + template \ + GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL(ValueType, IndexType) + + +namespace omp { +namespace factorization { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace factorization +} // namespace omp + + +namespace cuda { +namespace factorization { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace factorization +} // namespace cuda + + +namespace reference { +namespace factorization { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace factorization +} // namespace reference + + +namespace hip { +namespace factorization { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace factorization +} // namespace hip + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_FACTORIZATION_FACTORIZATION_KERNELS_HPP_ diff --git a/core/factorization/ilu.cpp b/core/factorization/ilu.cpp new file mode 100644 index 00000000000..c2f397151d3 --- /dev/null +++ b/core/factorization/ilu.cpp @@ -0,0 +1,126 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include +#include + + +#include "core/factorization/factorization_kernels.hpp" +#include "core/factorization/ilu_kernels.hpp" +#include "core/factorization/par_ilu_kernels.hpp" + + +namespace gko { +namespace factorization { +namespace ilu_factorization { + + +GKO_REGISTER_OPERATION(compute_ilu, ilu_factorization::compute_lu); +GKO_REGISTER_OPERATION(add_diagonal_elements, + factorization::add_diagonal_elements); +GKO_REGISTER_OPERATION(initialize_row_ptrs_l_u, + factorization::initialize_row_ptrs_l_u); +GKO_REGISTER_OPERATION(initialize_l_u, factorization::initialize_l_u); + + +} // namespace ilu_factorization + + +template +std::unique_ptr> Ilu::generate_l_u( + const std::shared_ptr &system_matrix) const +{ + GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix); + + const auto exec = this->get_executor(); + + // Converts the system matrix to CSR. + // Throws an exception if it is not convertible. + auto local_system_matrix = matrix_type::create(exec); + as>(system_matrix.get()) + ->convert_to(local_system_matrix.get()); + + // Add explicit diagonal zero elements if they are missing + exec->run(ilu_factorization::make_add_diagonal_elements( + local_system_matrix.get(), false)); + + // Compute LU factorization + exec->run(ilu_factorization::make_compute_ilu(local_system_matrix.get())); + + // Separate L and U factors: nnz + const auto matrix_size = local_system_matrix->get_size(); + const auto num_rows = matrix_size[0]; + Array l_row_ptrs{exec, num_rows + 1}; + Array u_row_ptrs{exec, num_rows + 1}; + exec->run(ilu_factorization::make_initialize_row_ptrs_l_u( + local_system_matrix.get(), l_row_ptrs.get_data(), + u_row_ptrs.get_data())); + + // Get nnz from device memory + auto l_nnz = static_cast( + exec->copy_val_to_host(l_row_ptrs.get_data() + num_rows)); + auto u_nnz = static_cast( + exec->copy_val_to_host(u_row_ptrs.get_data() + num_rows)); + + // Init arrays + Array 
l_col_idxs{exec, l_nnz}; + Array l_vals{exec, l_nnz}; + std::shared_ptr l_factor = matrix_type::create( + exec, matrix_size, std::move(l_vals), std::move(l_col_idxs), + std::move(l_row_ptrs), parameters_.l_strategy); + Array u_col_idxs{exec, u_nnz}; + Array u_vals{exec, u_nnz}; + std::shared_ptr u_factor = matrix_type::create( + exec, matrix_size, std::move(u_vals), std::move(u_col_idxs), + std::move(u_row_ptrs), parameters_.u_strategy); + + // Separate L and U: columns and values + exec->run(ilu_factorization::make_initialize_l_u( + local_system_matrix.get(), l_factor.get(), u_factor.get())); + + return Composition::create(std::move(l_factor), + std::move(u_factor)); +} + + +#define GKO_DECLARE_ILU(ValueType, IndexType) class Ilu +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU); + + +} // namespace factorization +} // namespace gko diff --git a/core/factorization/ilu_kernels.hpp b/core/factorization/ilu_kernels.hpp new file mode 100644 index 00000000000..17602ac4ab4 --- /dev/null +++ b/core/factorization/ilu_kernels.hpp @@ -0,0 +1,105 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_FACTORIZATION_ILU_KERNELS_HPP_ +#define GKO_CORE_FACTORIZATION_ILU_KERNELS_HPP_ + + +#include + + +#include + + +#include +#include +#include + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_ILU_COMPUTE_LU_KERNEL(ValueType, IndexType) \ + void compute_lu(std::shared_ptr exec, \ + matrix::Csr *system_matrix) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_ILU_COMPUTE_LU_KERNEL(ValueType, IndexType) + + +namespace omp { +namespace ilu_factorization { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace ilu_factorization +} // namespace omp + + +namespace cuda { +namespace ilu_factorization { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace ilu_factorization +} // namespace cuda + + +namespace reference { +namespace ilu_factorization { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace ilu_factorization +} // namespace reference + + +namespace hip { +namespace ilu_factorization { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace ilu_factorization +} // namespace hip + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // 
GKO_CORE_FACTORIZATION_ILU_KERNELS_HPP_ diff --git a/core/factorization/par_ict.cpp b/core/factorization/par_ict.cpp new file mode 100644 index 00000000000..46e9f8cebb6 --- /dev/null +++ b/core/factorization/par_ict.cpp @@ -0,0 +1,305 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include +#include +#include +#include +#include +#include + + +#include "core/factorization/factorization_kernels.hpp" +#include "core/factorization/par_ict_kernels.hpp" +#include "core/factorization/par_ilu_kernels.hpp" +#include "core/factorization/par_ilut_kernels.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" + + +namespace gko { +namespace factorization { +namespace par_ict_factorization { + + +GKO_REGISTER_OPERATION(threshold_select, + par_ilut_factorization::threshold_select); +GKO_REGISTER_OPERATION(threshold_filter, + par_ilut_factorization::threshold_filter); +GKO_REGISTER_OPERATION(threshold_filter_approx, + par_ilut_factorization::threshold_filter_approx); +GKO_REGISTER_OPERATION(add_candidates, par_ict_factorization::add_candidates); +GKO_REGISTER_OPERATION(compute_factor, par_ict_factorization::compute_factor); + +GKO_REGISTER_OPERATION(initialize_row_ptrs_l, + factorization::initialize_row_ptrs_l); +GKO_REGISTER_OPERATION(initialize_l, factorization::initialize_l); + +GKO_REGISTER_OPERATION(csr_conj_transpose, csr::conj_transpose); +GKO_REGISTER_OPERATION(convert_to_coo, csr::convert_to_coo); +GKO_REGISTER_OPERATION(spgemm, csr::spgemm); + + +} // namespace par_ict_factorization + + +using par_ict_factorization::make_add_candidates; +using par_ict_factorization::make_compute_factor; +using par_ict_factorization::make_convert_to_coo; +using par_ict_factorization::make_csr_conj_transpose; +using par_ict_factorization::make_initialize_l; +using par_ict_factorization::make_initialize_row_ptrs_l; +using par_ict_factorization::make_spgemm; +using par_ict_factorization::make_threshold_filter; +using par_ict_factorization::make_threshold_filter_approx; +using par_ict_factorization::make_threshold_select; + + +template +struct ParIctState { + using CsrMatrix = matrix::Csr; + using 
CooMatrix = matrix::Coo; + using CsrBuilder = matrix::CsrBuilder; + using CooBuilder = matrix::CooBuilder; + using Scalar = matrix::Dense; + // the executor on which the kernels are being executed + std::shared_ptr exec; + // max number of non-zeros L is supposed to have + IndexType l_nnz_limit; + // use the approximate selection/filter kernels? + bool use_approx_select; + // system matrix A + const CsrMatrix *system_matrix; + // current lower factor L + std::unique_ptr l; + // current upper factor L^H + std::unique_ptr lt; + // current product L * L^H + std::unique_ptr llt; + // temporary lower factor L' before filtering + std::unique_ptr l_new; + // lower factor L currently being updated with asynchronous iterations + std::unique_ptr l_coo; + // temporary array for threshold selection + Array selection_tmp; + // temporary array for threshold selection + Array> selection_tmp2; + // strategy to be used by the lower factor + std::shared_ptr l_strategy; + // strategy to be used by the upper factor + std::shared_ptr lt_strategy; + + ParIctState(std::shared_ptr exec_in, + const CsrMatrix *system_matrix_in, + std::unique_ptr l_in, IndexType l_nnz_limit, + bool use_approx_select, + std::shared_ptr l_strategy_, + std::shared_ptr lt_strategy_) + : exec{std::move(exec_in)}, + l_nnz_limit{l_nnz_limit}, + use_approx_select{use_approx_select}, + system_matrix{system_matrix_in}, + l{std::move(l_in)}, + selection_tmp{exec}, + selection_tmp2{exec}, + l_strategy{std::move(l_strategy_)}, + lt_strategy{std::move(lt_strategy_)} + { + auto mtx_size = system_matrix->get_size(); + auto l_nnz = l->get_num_stored_elements(); + lt = CsrMatrix::create(exec, mtx_size, l_nnz); + llt = CsrMatrix::create(exec, mtx_size); + l_new = CsrMatrix::create(exec, mtx_size); + l_coo = CooMatrix::create(exec, mtx_size); + exec->run(make_csr_conj_transpose(l.get(), lt.get())); + } + + std::unique_ptr> to_factors() && + { + l->set_strategy(l_strategy); + lt->set_strategy(lt_strategy); + return 
Composition::create(std::move(l), std::move(lt)); + } + + void iterate(); +}; + + +template +std::unique_ptr> +ParIct::generate_l_lt( + const std::shared_ptr &system_matrix) const +{ + using CsrMatrix = matrix::Csr; + + GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix); + // make sure no invalid parameters break our kernels! + GKO_ASSERT_EQ(parameters_.fill_in_limit > 0.0, true); + + const auto exec = this->get_executor(); + + // convert and/or sort the matrix if necessary + std::unique_ptr csr_system_matrix_unique_ptr{}; + auto csr_system_matrix = + dynamic_cast(system_matrix.get()); + if (csr_system_matrix == nullptr || + csr_system_matrix->get_executor() != exec) { + csr_system_matrix_unique_ptr = CsrMatrix::create(exec); + as>(system_matrix.get()) + ->convert_to(csr_system_matrix_unique_ptr.get()); + csr_system_matrix = csr_system_matrix_unique_ptr.get(); + } + if (!parameters_.skip_sorting) { + if (csr_system_matrix_unique_ptr == nullptr) { + csr_system_matrix_unique_ptr = CsrMatrix::create(exec); + csr_system_matrix_unique_ptr->copy_from(csr_system_matrix); + } + csr_system_matrix_unique_ptr->sort_by_column_index(); + csr_system_matrix = csr_system_matrix_unique_ptr.get(); + } + + // initialize the L matrix data structures + const auto num_rows = csr_system_matrix->get_size()[0]; + Array l_row_ptrs_array{exec, num_rows + 1}; + auto l_row_ptrs = l_row_ptrs_array.get_data(); + exec->run(make_initialize_row_ptrs_l(csr_system_matrix, l_row_ptrs)); + + auto l_nnz = + static_cast(exec->copy_val_to_host(l_row_ptrs + num_rows)); + + auto mtx_size = csr_system_matrix->get_size(); + auto l = CsrMatrix::create(exec, mtx_size, Array{exec, l_nnz}, + Array{exec, l_nnz}, + std::move(l_row_ptrs_array)); + + // initialize L + exec->run(make_initialize_l(csr_system_matrix, l.get(), true)); + + // compute limit #nnz for L + auto l_nnz_limit = + static_cast(l_nnz * parameters_.fill_in_limit); + + ParIctState state{exec, + csr_system_matrix, + std::move(l), + l_nnz_limit, + 
parameters_.approximate_select, + parameters_.l_strategy, + parameters_.lt_strategy}; + + for (size_type it = 0; it < parameters_.iterations; ++it) { + state.iterate(); + } + + return std::move(state).to_factors(); +} + + +template +void ParIctState::iterate() +{ + // compute L * L^H + exec->run(make_spgemm(l.get(), lt.get(), llt.get())); + + // add new candidates to L' factor + exec->run( + make_add_candidates(llt.get(), system_matrix, l.get(), l_new.get())); + + // update L(COO), L'^H sizes and pointers + { + auto l_nnz = l_new->get_num_stored_elements(); + CooBuilder l_builder{l_coo.get()}; + // resize arrays that will be filled + l_builder.get_row_idx_array().resize_and_reset(l_nnz); + // update arrays that will be aliased + l_builder.get_col_idx_array() = + Array::view(exec, l_nnz, l_new->get_col_idxs()); + l_builder.get_value_array() = + Array::view(exec, l_nnz, l_new->get_values()); + } + + // convert L into COO format + exec->run(make_convert_to_coo(l_new.get(), l_coo.get())); + + // execute asynchronous iteration + exec->run(make_compute_factor(system_matrix, l_new.get(), l_coo.get())); + + // determine ranks for selection/filtering + IndexType l_nnz = l_new->get_num_stored_elements(); + // make sure that the rank is in [0, *_nnz) + auto l_filter_rank = std::max(0, l_nnz - l_nnz_limit - 1); + if (use_approx_select) { + remove_complex tmp{}; + // remove approximately smallest candidates + exec->run(make_threshold_filter_approx(l_new.get(), l_filter_rank, + selection_tmp, tmp, l.get(), + l_coo.get())); + } else { + // select threshold to remove smallest candidates + remove_complex l_threshold{}; + exec->run(make_threshold_select(l_new.get(), l_filter_rank, + selection_tmp, selection_tmp2, + l_threshold)); + + // remove smallest candidates + exec->run(make_threshold_filter(l_new.get(), l_threshold, l.get(), + l_coo.get(), true)); + } + + // execute asynchronous iteration + exec->run(make_compute_factor(system_matrix, l.get(), l_coo.get())); + + // convert L 
to L^H + { + auto l_nnz = l->get_num_stored_elements(); + CsrBuilder lt_builder{lt.get()}; + lt_builder.get_col_idx_array().resize_and_reset(l_nnz); + lt_builder.get_value_array().resize_and_reset(l_nnz); + } + exec->run(make_csr_conj_transpose(l.get(), lt.get())); +} + + +#define GKO_DECLARE_PAR_ICT(ValueType, IndexType) \ + class ParIct +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ICT); + + +} // namespace factorization +} // namespace gko \ No newline at end of file diff --git a/core/factorization/par_ict_kernels.hpp b/core/factorization/par_ict_kernels.hpp new file mode 100644 index 00000000000..f02b6ac7bb6 --- /dev/null +++ b/core/factorization/par_ict_kernels.hpp @@ -0,0 +1,116 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_FACTORIZATION_PAR_ICT_KERNELS_HPP_ +#define GKO_CORE_FACTORIZATION_PAR_ICT_KERNELS_HPP_ + + +#include + + +#include + + +#include +#include +#include +#include + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL(ValueType, IndexType) \ + void add_candidates(std::shared_ptr exec, \ + const matrix::Csr *llt, \ + const matrix::Csr *a, \ + const matrix::Csr *l, \ + matrix::Csr *l_new) + +#define GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL(ValueType, IndexType) \ + void compute_factor(std::shared_ptr exec, \ + const matrix::Csr *a, \ + matrix::Csr *l, \ + const matrix::Coo *l_coo) + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL(ValueType, IndexType) + + +namespace omp { +namespace par_ict_factorization { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace par_ict_factorization +} // namespace omp + + +namespace cuda { +namespace par_ict_factorization { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace par_ict_factorization +} // namespace cuda + + +namespace reference { +namespace par_ict_factorization { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace par_ict_factorization +} // namespace reference + + +namespace hip { +namespace par_ict_factorization { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // 
namespace par_ict_factorization +} // namespace hip + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_FACTORIZATION_PAR_ICT_KERNELS_HPP_ diff --git a/core/factorization/par_ilu.cpp b/core/factorization/par_ilu.cpp index 3d6763f7926..d61a27747af 100644 --- a/core/factorization/par_ilu.cpp +++ b/core/factorization/par_ilu.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/factorization/factorization_kernels.hpp" #include "core/factorization/par_ilu_kernels.hpp" #include "core/matrix/csr_kernels.hpp" @@ -53,9 +54,11 @@ namespace factorization { namespace par_ilu_factorization { +GKO_REGISTER_OPERATION(add_diagonal_elements, + factorization::add_diagonal_elements); GKO_REGISTER_OPERATION(initialize_row_ptrs_l_u, - par_ilu_factorization::initialize_row_ptrs_l_u); -GKO_REGISTER_OPERATION(initialize_l_u, par_ilu_factorization::initialize_l_u); + factorization::initialize_row_ptrs_l_u); +GKO_REGISTER_OPERATION(initialize_l_u, factorization::initialize_l_u); GKO_REGISTER_OPERATION(compute_l_u_factors, par_ilu_factorization::compute_l_u_factors); GKO_REGISTER_OPERATION(csr_transpose, csr::transpose); @@ -67,7 +70,9 @@ GKO_REGISTER_OPERATION(csr_transpose, csr::transpose); template std::unique_ptr> ParIlu::generate_l_u( - const std::shared_ptr &system_matrix, bool skip_sorting) const + const std::shared_ptr &system_matrix, bool skip_sorting, + std::shared_ptr l_strategy, + std::shared_ptr u_strategy) const { using CsrMatrix = matrix::Csr; using CooMatrix = matrix::Coo; @@ -75,33 +80,22 @@ ParIlu::generate_l_u( GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix); const auto exec = this->get_executor(); - 
const auto host_exec = exec->get_master(); - - // If required, it is also possible to make this a Factory parameter - auto csr_strategy = std::make_shared(); - - // Only copies the matrix if it is not on the same executor or was not in - // the right format. Throws an exception if it is not convertable. - std::unique_ptr csr_system_matrix_unique_ptr{}; - auto csr_system_matrix = - dynamic_cast(system_matrix.get()); - if (csr_system_matrix == nullptr || - csr_system_matrix->get_executor() != exec) { - csr_system_matrix_unique_ptr = CsrMatrix::create(exec); - as>(system_matrix.get()) - ->convert_to(csr_system_matrix_unique_ptr.get()); - csr_system_matrix = csr_system_matrix_unique_ptr.get(); - } - // If it needs to be sorted, copy it if necessary and sort it + + // Converts the system matrix to CSR. + // Throws an exception if it is not convertible. + auto csr_system_matrix_unique_ptr = CsrMatrix::create(exec); + as>(system_matrix.get()) + ->convert_to(csr_system_matrix_unique_ptr.get()); + auto csr_system_matrix = csr_system_matrix_unique_ptr.get(); + // If necessary, sort it if (!skip_sorting) { - if (csr_system_matrix_unique_ptr == nullptr) { - csr_system_matrix_unique_ptr = CsrMatrix::create(exec); - csr_system_matrix_unique_ptr->copy_from(csr_system_matrix); - } - csr_system_matrix_unique_ptr->sort_by_column_index(); - csr_system_matrix = csr_system_matrix_unique_ptr.get(); + csr_system_matrix->sort_by_column_index(); } + // Add explicit diagonal zero elements if they are missing + exec->run(par_ilu_factorization::make_add_diagonal_elements( + csr_system_matrix, true)); + const auto matrix_size = csr_system_matrix->get_size(); const auto number_rows = matrix_size[0]; Array l_row_ptrs{exec, number_rows + 1}; @@ -109,15 +103,11 @@ ParIlu::generate_l_u( exec->run(par_ilu_factorization::make_initialize_row_ptrs_l_u( csr_system_matrix, l_row_ptrs.get_data(), u_row_ptrs.get_data())); - IndexType l_nnz_it; - IndexType u_nnz_it; - // Since nnz is always at row_ptrs[m], 
it can be extracted easily - host_exec->copy_from(exec.get(), 1, l_row_ptrs.get_data() + number_rows, - &l_nnz_it); - host_exec->copy_from(exec.get(), 1, u_row_ptrs.get_data() + number_rows, - &u_nnz_it); - auto l_nnz = static_cast(l_nnz_it); - auto u_nnz = static_cast(u_nnz_it); + // Get nnz from device memory + auto l_nnz = static_cast( + exec->copy_val_to_host(l_row_ptrs.get_data() + number_rows)); + auto u_nnz = static_cast( + exec->copy_val_to_host(u_row_ptrs.get_data() + number_rows)); // Since `row_ptrs` of L and U is already created, the matrix can be // directly created with it @@ -125,12 +115,12 @@ ParIlu::generate_l_u( Array l_vals{exec, l_nnz}; std::shared_ptr l_factor = l_matrix_type::create( exec, matrix_size, std::move(l_vals), std::move(l_col_idxs), - std::move(l_row_ptrs), csr_strategy); + std::move(l_row_ptrs), l_strategy); Array u_col_idxs{exec, u_nnz}; Array u_vals{exec, u_nnz}; std::shared_ptr u_factor = u_matrix_type::create( exec, matrix_size, std::move(u_vals), std::move(u_col_idxs), - std::move(u_row_ptrs), csr_strategy); + std::move(u_row_ptrs), u_strategy); exec->run(par_ilu_factorization::make_initialize_l_u( csr_system_matrix, l_factor.get(), u_factor.get())); @@ -173,8 +163,8 @@ ParIlu::generate_l_u( // Since the transposed version has the exact same non-zero positions // as `u_factor`, we can both skip the allocation and the `make_srow()` // call from CSR, leaving just the `transpose()` kernel call - exec->run(par_ilu_factorization::make_csr_transpose(u_factor.get(), - u_factor_transpose)); + exec->run(par_ilu_factorization::make_csr_transpose(u_factor_transpose, + u_factor.get())); return Composition::create(std::move(l_factor), std::move(u_factor)); diff --git a/core/factorization/par_ilu_kernels.hpp b/core/factorization/par_ilu_kernels.hpp index df96ff0389f..09bc1dd2596 100644 --- a/core/factorization/par_ilu_kernels.hpp +++ b/core/factorization/par_ilu_kernels.hpp @@ -1,5 +1,5 @@ 
/************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -48,19 +48,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { namespace kernels { - -#define GKO_DECLARE_PAR_ILU_INITIALIZE_ROW_PTRS_L_U_KERNEL(ValueType, \ - IndexType) \ - void initialize_row_ptrs_l_u( \ - std::shared_ptr exec, \ - const matrix::Csr *system_matrix, \ - IndexType *l_row_ptrs, IndexType *u_row_ptrs) -#define GKO_DECLARE_PAR_ILU_INITIALIZE_L_U_KERNEL(ValueType, IndexType) \ - void initialize_l_u( \ - std::shared_ptr exec, \ - const matrix::Csr *system_matrix, \ - matrix::Csr *l_factor, \ - matrix::Csr *u_factor) #define GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL(ValueType, IndexType) \ void compute_l_u_factors( \ std::shared_ptr exec, size_type iterations, \ @@ -69,12 +56,8 @@ namespace kernels { matrix::Csr *u_factor) -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_PAR_ILU_INITIALIZE_ROW_PTRS_L_U_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_PAR_ILU_INITIALIZE_L_U_KERNEL(ValueType, IndexType); \ - template \ +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL(ValueType, IndexType) @@ -105,6 +88,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace reference +namespace hip { +namespace par_ilu_factorization { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace par_ilu_factorization +} // namespace hip + + #undef GKO_DECLARE_ALL_AS_TEMPLATES diff --git a/core/factorization/par_ilut.cpp b/core/factorization/par_ilut.cpp new file mode 100644 index 00000000000..1eb3dfeb950 --- /dev/null +++ b/core/factorization/par_ilut.cpp @@ -0,0 +1,355 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include +#include +#include +#include +#include +#include + + +#include "core/factorization/factorization_kernels.hpp" +#include "core/factorization/par_ilu_kernels.hpp" +#include "core/factorization/par_ilut_kernels.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" + + +namespace gko { +namespace factorization { +namespace par_ilut_factorization { + + +GKO_REGISTER_OPERATION(threshold_select, + par_ilut_factorization::threshold_select); +GKO_REGISTER_OPERATION(threshold_filter, + par_ilut_factorization::threshold_filter); +GKO_REGISTER_OPERATION(threshold_filter_approx, + par_ilut_factorization::threshold_filter_approx); +GKO_REGISTER_OPERATION(add_candidates, par_ilut_factorization::add_candidates); +GKO_REGISTER_OPERATION(compute_l_u_factors, + par_ilut_factorization::compute_l_u_factors); + +GKO_REGISTER_OPERATION(initialize_row_ptrs_l_u, + factorization::initialize_row_ptrs_l_u); +GKO_REGISTER_OPERATION(initialize_l_u, factorization::initialize_l_u); + +GKO_REGISTER_OPERATION(csr_transpose, csr::transpose); +GKO_REGISTER_OPERATION(convert_to_coo, csr::convert_to_coo); +GKO_REGISTER_OPERATION(spgemm, csr::spgemm); + + +} // namespace par_ilut_factorization + + +using par_ilut_factorization::make_add_candidates; +using par_ilut_factorization::make_compute_l_u_factors; +using par_ilut_factorization::make_convert_to_coo; +using par_ilut_factorization::make_csr_transpose; +using par_ilut_factorization::make_initialize_l_u; +using par_ilut_factorization::make_initialize_row_ptrs_l_u; +using par_ilut_factorization::make_spgemm; +using par_ilut_factorization::make_threshold_filter; +using par_ilut_factorization::make_threshold_filter_approx; +using par_ilut_factorization::make_threshold_select; + + +template +struct ParIlutState { + using CsrMatrix = matrix::Csr; + using CooMatrix = matrix::Coo; + 
using CsrBuilder = matrix::CsrBuilder; + using CooBuilder = matrix::CooBuilder; + using Scalar = matrix::Dense; + // the executor on which the kernels are being executed + std::shared_ptr exec; + // max number of non-zeros L is supposed to have + IndexType l_nnz_limit; + // max number of non-zeros U is supposed to have + IndexType u_nnz_limit; + // use the approximate selection/filter kernels? + bool use_approx_select; + // system matrix A + const CsrMatrix *system_matrix; + // current lower factor L + std::unique_ptr l; + // current upper factor U + std::unique_ptr u; + // current upper factor U in CSC format + std::unique_ptr u_csc; + // current product L * U + std::unique_ptr lu; + // temporary lower factor L' before filtering + std::unique_ptr l_new; + // temporary upper factor U' before filtering + std::unique_ptr u_new; + // temporary upper factor U' in CSC format before filtering + std::unique_ptr u_new_csc; + // lower factor L currently being updated with asynchronous iterations + std::unique_ptr l_coo; + // upper factor U currently being updated + std::unique_ptr u_coo; + // temporary array for threshold selection + Array selection_tmp; + // temporary array for threshold selection + Array> selection_tmp2; + // strategy to be used by the lower factor + std::shared_ptr l_strategy; + // strategy to be used by the upper factor + std::shared_ptr u_strategy; + + ParIlutState(std::shared_ptr exec_in, + const CsrMatrix *system_matrix_in, + std::unique_ptr l_in, + std::unique_ptr u_in, IndexType l_nnz_limit, + IndexType u_nnz_limit, bool use_approx_select, + std::shared_ptr l_strategy_, + std::shared_ptr u_strategy_) + : exec{std::move(exec_in)}, + l_nnz_limit{l_nnz_limit}, + u_nnz_limit{u_nnz_limit}, + use_approx_select{use_approx_select}, + system_matrix{system_matrix_in}, + l{std::move(l_in)}, + u{std::move(u_in)}, + selection_tmp{exec}, + selection_tmp2{exec}, + l_strategy{std::move(l_strategy_)}, + u_strategy{std::move(u_strategy_)} + { + auto mtx_size = 
system_matrix->get_size(); + auto u_nnz = u->get_num_stored_elements(); + u_csc = CsrMatrix::create(exec, mtx_size, u_nnz); + lu = CsrMatrix::create(exec, mtx_size); + l_new = CsrMatrix::create(exec, mtx_size); + u_new = CsrMatrix::create(exec, mtx_size); + u_new_csc = CsrMatrix::create(exec, mtx_size); + l_coo = CooMatrix::create(exec, mtx_size); + u_coo = CooMatrix::create(exec, mtx_size); + exec->run(make_csr_transpose(u.get(), u_csc.get())); + } + + std::unique_ptr> to_factors() && + { + l->set_strategy(l_strategy); + u->set_strategy(u_strategy); + return Composition::create(std::move(l), std::move(u)); + } + + void iterate(); +}; + + +template +std::unique_ptr> +ParIlut::generate_l_u( + const std::shared_ptr &system_matrix) const +{ + using CsrMatrix = matrix::Csr; + + GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix); + // make sure no invalid parameters break our kernels! + GKO_ASSERT_EQ(parameters_.fill_in_limit > 0.0, true); + + const auto exec = this->get_executor(); + + // convert and/or sort the matrix if necessary + std::unique_ptr csr_system_matrix_unique_ptr{}; + auto csr_system_matrix = + dynamic_cast(system_matrix.get()); + if (csr_system_matrix == nullptr || + csr_system_matrix->get_executor() != exec) { + csr_system_matrix_unique_ptr = CsrMatrix::create(exec); + as>(system_matrix.get()) + ->convert_to(csr_system_matrix_unique_ptr.get()); + csr_system_matrix = csr_system_matrix_unique_ptr.get(); + } + if (!parameters_.skip_sorting) { + if (csr_system_matrix_unique_ptr == nullptr) { + csr_system_matrix_unique_ptr = CsrMatrix::create(exec); + csr_system_matrix_unique_ptr->copy_from(csr_system_matrix); + } + csr_system_matrix_unique_ptr->sort_by_column_index(); + csr_system_matrix = csr_system_matrix_unique_ptr.get(); + } + + // initialize the L and U matrix data structures + const auto num_rows = csr_system_matrix->get_size()[0]; + Array l_row_ptrs_array{exec, num_rows + 1}; + Array u_row_ptrs_array{exec, num_rows + 1}; + auto l_row_ptrs = 
l_row_ptrs_array.get_data(); + auto u_row_ptrs = u_row_ptrs_array.get_data(); + exec->run(make_initialize_row_ptrs_l_u(csr_system_matrix, l_row_ptrs, + u_row_ptrs)); + + auto l_nnz = + static_cast(exec->copy_val_to_host(l_row_ptrs + num_rows)); + auto u_nnz = + static_cast(exec->copy_val_to_host(u_row_ptrs + num_rows)); + + auto mtx_size = csr_system_matrix->get_size(); + auto l = CsrMatrix::create(exec, mtx_size, Array{exec, l_nnz}, + Array{exec, l_nnz}, + std::move(l_row_ptrs_array)); + auto u = CsrMatrix::create(exec, mtx_size, Array{exec, u_nnz}, + Array{exec, u_nnz}, + std::move(u_row_ptrs_array)); + + // initialize L and U + exec->run(make_initialize_l_u(csr_system_matrix, l.get(), u.get())); + + // compute limit #nnz for L and U + auto l_nnz_limit = + static_cast(l_nnz * parameters_.fill_in_limit); + auto u_nnz_limit = + static_cast(u_nnz * parameters_.fill_in_limit); + + ParIlutState state{exec, + csr_system_matrix, + std::move(l), + std::move(u), + l_nnz_limit, + u_nnz_limit, + parameters_.approximate_select, + parameters_.l_strategy, + parameters_.u_strategy}; + + for (size_type it = 0; it < parameters_.iterations; ++it) { + state.iterate(); + } + + return std::move(state).to_factors(); +} + + +template +void ParIlutState::iterate() +{ + // compute L * U + exec->run(make_spgemm(l.get(), u.get(), lu.get())); + + // add new candidates to L' and U' factors + exec->run(make_add_candidates(lu.get(), system_matrix, l.get(), u.get(), + l_new.get(), u_new.get())); + + // update U'(CSC), L'(COO), U'(COO) sizes and pointers + { + auto l_nnz = l_new->get_num_stored_elements(); + auto u_nnz = u_new->get_num_stored_elements(); + CooBuilder l_builder{l_coo.get()}; + CooBuilder u_builder{u_coo.get()}; + CsrBuilder u_csc_builder{u_new_csc.get()}; + // resize arrays that will be filled + l_builder.get_row_idx_array().resize_and_reset(l_nnz); + u_builder.get_row_idx_array().resize_and_reset(u_nnz); + u_csc_builder.get_col_idx_array().resize_and_reset(u_nnz); + 
u_csc_builder.get_value_array().resize_and_reset(u_nnz); + // update arrays that will be aliased + l_builder.get_col_idx_array() = + Array::view(exec, l_nnz, l_new->get_col_idxs()); + u_builder.get_col_idx_array() = + Array::view(exec, u_nnz, u_new->get_col_idxs()); + l_builder.get_value_array() = + Array::view(exec, l_nnz, l_new->get_values()); + u_builder.get_value_array() = + Array::view(exec, u_nnz, u_new->get_values()); + } + + // convert U' into CSC format + exec->run(make_csr_transpose(u_new.get(), u_new_csc.get())); + + // convert L' and U' into COO format + exec->run(make_convert_to_coo(l_new.get(), l_coo.get())); + exec->run(make_convert_to_coo(u_new.get(), u_coo.get())); + + // execute asynchronous iteration + exec->run(make_compute_l_u_factors(system_matrix, l_new.get(), l_coo.get(), + u_new.get(), u_coo.get(), + u_new_csc.get())); + + // determine ranks for selection/filtering + IndexType l_nnz = l_new->get_num_stored_elements(); + IndexType u_nnz = u_new->get_num_stored_elements(); + // make sure that the rank is in [0, *_nnz) + auto l_filter_rank = std::max(0, l_nnz - l_nnz_limit - 1); + auto u_filter_rank = std::max(0, u_nnz - u_nnz_limit - 1); + remove_complex l_threshold{}; + remove_complex u_threshold{}; + CooMatrix *null_coo = nullptr; + if (use_approx_select) { + // remove approximately smallest candidates from L' and U'^T + exec->run(make_threshold_filter_approx(l_new.get(), l_filter_rank, + selection_tmp, l_threshold, + l.get(), l_coo.get())); + exec->run(make_threshold_filter_approx(u_new_csc.get(), u_filter_rank, + selection_tmp, u_threshold, + u_csc.get(), null_coo)); + } else { + // select threshold to remove smallest candidates + exec->run(make_threshold_select(l_new.get(), l_filter_rank, + selection_tmp, selection_tmp2, + l_threshold)); + exec->run(make_threshold_select(u_new_csc.get(), u_filter_rank, + selection_tmp, selection_tmp2, + u_threshold)); + + // remove smallest candidates from L' and U'^T + 
exec->run(make_threshold_filter(l_new.get(), l_threshold, l.get(), + l_coo.get(), true)); + exec->run(make_threshold_filter(u_new_csc.get(), u_threshold, + u_csc.get(), null_coo, true)); + } + // remove smallest candidates from U' + exec->run(make_threshold_filter(u_new.get(), u_threshold, u.get(), + u_coo.get(), false)); + + // execute asynchronous iteration + exec->run(make_compute_l_u_factors(system_matrix, l.get(), l_coo.get(), + u.get(), u_coo.get(), u_csc.get())); +} + + +#define GKO_DECLARE_PAR_ILUT(ValueType, IndexType) \ + class ParIlut +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PAR_ILUT); + + +} // namespace factorization +} // namespace gko \ No newline at end of file diff --git a/core/factorization/par_ilut_kernels.hpp b/core/factorization/par_ilut_kernels.hpp new file mode 100644 index 00000000000..9bb19596c3f --- /dev/null +++ b/core/factorization/par_ilut_kernels.hpp @@ -0,0 +1,153 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_FACTORIZATION_PAR_ILUT_KERNELS_HPP_ +#define GKO_CORE_FACTORIZATION_PAR_ILUT_KERNELS_HPP_ + + +#include + + +#include + + +#include +#include +#include +#include + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL(ValueType, IndexType) \ + void add_candidates(std::shared_ptr exec, \ + const matrix::Csr *lu, \ + const matrix::Csr *a, \ + const matrix::Csr *l, \ + const matrix::Csr *u, \ + matrix::Csr *l_new, \ + matrix::Csr *u_new) + +#define GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL(ValueType, IndexType) \ + void compute_l_u_factors(std::shared_ptr exec, \ + const matrix::Csr *a, \ + matrix::Csr *l, \ + const matrix::Coo *l_coo, \ + matrix::Csr *u, \ + const matrix::Coo *u_coo, \ + matrix::Csr *u_csc) + +#define GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL(ValueType, IndexType) \ + void threshold_select(std::shared_ptr exec, \ + const matrix::Csr *m, \ + IndexType rank, Array &tmp, \ + Array> &tmp2, \ + remove_complex &threshold) + +#define GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL(ValueType, IndexType) \ + void threshold_filter(std::shared_ptr exec, \ + const matrix::Csr *m, \ + remove_complex threshold, \ + matrix::Csr *m_out, \ + matrix::Coo *m_out_coo, \ + bool lower) + +#define GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL(ValueType, \ + IndexType) \ + void threshold_filter_approx(std::shared_ptr exec, \ + 
const matrix::Csr *m, \ + IndexType rank, Array &tmp, \ + remove_complex &threshold, \ + matrix::Csr *m_out, \ + matrix::Coo *m_out_coo) + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + constexpr auto sampleselect_searchtree_height = 8; \ + constexpr auto sampleselect_oversampling = 4; \ + template \ + GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL(ValueType, IndexType) + + +namespace omp { +namespace par_ilut_factorization { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace par_ilut_factorization +} // namespace omp + + +namespace cuda { +namespace par_ilut_factorization { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace par_ilut_factorization +} // namespace cuda + + +namespace reference { +namespace par_ilut_factorization { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace par_ilut_factorization +} // namespace reference + + +namespace hip { +namespace par_ilut_factorization { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace par_ilut_factorization +} // namespace hip + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_FACTORIZATION_PAR_ILUT_KERNELS_HPP_ diff --git a/core/log/convergence.cpp b/core/log/convergence.cpp index 4cf6ed742d6..9947e40fc60 100644 --- a/core/log/convergence.cpp +++ b/core/log/convergence.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -30,11 +30,11 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ - #include #include +#include #include #include @@ -60,7 +60,8 @@ void Convergence::on_criterion_check_completed( this->residual_norm_.reset(residual_norm->clone().release()); } else if (residual != nullptr) { using Vector = matrix::Dense; - this->residual_norm_ = Vector::create( + using NormVector = matrix::Dense>; + this->residual_norm_ = NormVector::create( residual->get_executor(), dim<2>{1, residual->get_size()[1]}); auto dense_r = as(residual); dense_r->compute_norm2(this->residual_norm_.get()); diff --git a/core/log/logger.cpp b/core/log/logger.cpp index 75f48036937..46ee98b2895 100644 --- a/core/log/logger.cpp +++ b/core/log/logger.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ - #include @@ -75,5 +74,6 @@ constexpr Logger::mask_type Logger::criterion_check_completed_mask; constexpr Logger::mask_type Logger::iteration_complete_mask; + } // namespace log } // namespace gko diff --git a/core/log/papi.cpp b/core/log/papi.cpp index 50da5bd11c9..1c8a17419fa 100644 --- a/core/log/papi.cpp +++ b/core/log/papi.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ - #include @@ -42,10 +41,6 @@ namespace gko { namespace log { -template -size_type Papi::logger_count = 0; - - template void Papi::on_allocation_started(const Executor *exec, const size_type &num_bytes) const diff --git a/core/log/record.cpp b/core/log/record.cpp index 19c0992e2a6..48026c1563b 100644 --- a/core/log/record.cpp +++ b/core/log/record.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ - #include diff --git a/core/log/stream.cpp b/core/log/stream.cpp index 3ffd3c11e25..3cad7421aee 100644 --- a/core/log/stream.cpp +++ b/core/log/stream.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ - #include @@ -39,14 +38,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include #include #include #include -#include - - namespace gko { namespace log { diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp index 73d001d6dba..2cd8f34982f 100644 --- a/core/matrix/coo.cpp +++ b/core/matrix/coo.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -31,23 +31,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ #include -#include + + +#include +#include #include #include #include #include +#include #include #include "core/matrix/coo_kernels.hpp" -#include -#include - - namespace gko { namespace matrix { @@ -103,6 +103,25 @@ void Coo::apply2_impl(const LinOp *alpha, const LinOp *b, } +template +void Coo::convert_to( + Coo, IndexType> *result) const +{ + result->values_ = this->values_; + result->row_idxs_ = this->row_idxs_; + result->col_idxs_ = this->col_idxs_; + result->set_size(this->get_size()); +} + + +template +void Coo::move_to( + Coo, IndexType> *result) +{ + this->convert_to(result); +} + + template void Coo::convert_to( Csr *result) const @@ -113,7 +132,7 @@ void Coo::convert_to( result->get_strategy()); tmp->values_ = this->values_; tmp->col_idxs_ = this->col_idxs_; - exec->run(coo::make_convert_to_csr(tmp.get(), this)); + exec->run(coo::make_convert_to_csr(this, tmp.get())); tmp->make_srow(); tmp->move_to(result); } @@ -128,7 +147,7 @@ void Coo::move_to(Csr *result) result->get_strategy()); tmp->values_ = std::move(this->values_); tmp->col_idxs_ = std::move(this->col_idxs_); - exec->run(coo::make_convert_to_csr(tmp.get(), this)); + exec->run(coo::make_convert_to_csr(this, tmp.get())); tmp->make_srow(); tmp->move_to(result); } @@ -139,7 +158,7 @@ void Coo::convert_to(Dense *result) const { auto exec = 
this->get_executor(); auto tmp = Dense::create(exec, this->get_size()); - exec->run(coo::make_convert_to_dense(tmp.get(), this)); + exec->run(coo::make_convert_to_dense(this, tmp.get())); tmp->move_to(result); } diff --git a/core/matrix/coo_builder.hpp b/core/matrix/coo_builder.hpp new file mode 100644 index 00000000000..de323ad42b6 --- /dev/null +++ b/core/matrix/coo_builder.hpp @@ -0,0 +1,89 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_CORE_MATRIX_COO_BUILDER_HPP_ +#define GKO_CORE_MATRIX_COO_BUILDER_HPP_ + + +#include + + +namespace gko { +namespace matrix { + + +/** + * @internal + * + * Allows intrusive access to the arrays stored within a @ref Coo matrix. + * + * @tparam ValueType the value type of the matrix + * @tparam IndexType the index type of the matrix + */ +template +class CooBuilder { +public: + /** + * Returns the row index array of the COO matrix. + */ + Array &get_row_idx_array() { return matrix_->row_idxs_; } + + /** + * Returns the column index array of the COO matrix. + */ + Array &get_col_idx_array() { return matrix_->col_idxs_; } + + /** + * Returns the value array of the COO matrix. + */ + Array &get_value_array() { return matrix_->values_; } + + /** + * Initializes a CooBuilder from an existing COO matrix. + */ + explicit CooBuilder(Coo *matrix) : matrix_{matrix} {} + + // make this type non-movable + CooBuilder(const CooBuilder &) = delete; + CooBuilder(CooBuilder &&) = delete; + CooBuilder &operator=(const CooBuilder &) = delete; + CooBuilder &operator=(CooBuilder &&) = delete; + +private: + Coo *matrix_; +}; + + +} // namespace matrix +} // namespace gko + +#endif // GKO_CORE_MATRIX_COO_BUILDER_HPP_ diff --git a/core/matrix/coo_kernels.hpp b/core/matrix/coo_kernels.hpp index e8508ac1c8c..48a83f8f9b1 100644 --- a/core/matrix/coo_kernels.hpp +++ b/core/matrix/coo_kernels.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,8 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_CORE_MATRIX_COO_KERNELS_HPP_ -#include #include + + +#include #include #include @@ -69,15 +71,15 @@ namespace kernels { const matrix::Dense *b, \ matrix::Dense *c) -#define GKO_DECLARE_COO_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \ - void convert_to_dense(std::shared_ptr exec, \ - matrix::Dense *result, \ - const matrix::Coo *source) +#define GKO_DECLARE_COO_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \ + void convert_to_dense(std::shared_ptr exec, \ + const matrix::Coo *source, \ + matrix::Dense *result) -#define GKO_DECLARE_COO_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) \ - void convert_to_csr(std::shared_ptr exec, \ - matrix::Csr *result, \ - const matrix::Coo *source) +#define GKO_DECLARE_COO_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) \ + void convert_to_csr(std::shared_ptr exec, \ + const matrix::Coo *source, \ + matrix::Csr *result) #define GKO_DECLARE_ALL_AS_TEMPLATES \ template \ @@ -121,6 +123,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace reference +namespace hip { +namespace coo { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace coo +} // namespace hip + + #undef GKO_DECLARE_ALL_AS_TEMPLATES diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index 53dd2f02b12..2d0012c7cb6 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include @@ -40,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include #include @@ -54,6 +56,9 @@ namespace csr { GKO_REGISTER_OPERATION(spmv, csr::spmv); GKO_REGISTER_OPERATION(advanced_spmv, csr::advanced_spmv); +GKO_REGISTER_OPERATION(spgemm, csr::spgemm); +GKO_REGISTER_OPERATION(advanced_spgemm, csr::advanced_spgemm); +GKO_REGISTER_OPERATION(spgeam, csr::spgeam); GKO_REGISTER_OPERATION(convert_to_coo, csr::convert_to_coo); GKO_REGISTER_OPERATION(convert_to_dense, csr::convert_to_dense); GKO_REGISTER_OPERATION(convert_to_sellp, csr::convert_to_sellp); @@ -62,6 +67,10 @@ GKO_REGISTER_OPERATION(convert_to_ell, csr::convert_to_ell); GKO_REGISTER_OPERATION(convert_to_hybrid, csr::convert_to_hybrid); GKO_REGISTER_OPERATION(transpose, csr::transpose); GKO_REGISTER_OPERATION(conj_transpose, csr::conj_transpose); +GKO_REGISTER_OPERATION(row_permute, csr::row_permute); +GKO_REGISTER_OPERATION(column_permute, csr::column_permute); +GKO_REGISTER_OPERATION(inverse_row_permute, csr::inverse_row_permute); +GKO_REGISTER_OPERATION(inverse_column_permute, csr::inverse_column_permute); GKO_REGISTER_OPERATION(calculate_max_nnz_per_row, csr::calculate_max_nnz_per_row); GKO_REGISTER_OPERATION(calculate_nonzeros_per_row, @@ -78,7 +87,16 @@ template void Csr::apply_impl(const LinOp *b, LinOp *x) const { using Dense = Dense; - this->get_executor()->run(csr::make_spmv(this, as(b), as(x))); + using TCsr = Csr; + if (auto b_csr = dynamic_cast(b)) { + // if b is a CSR matrix, we compute a SpGeMM + auto x_csr = as(x); + this->get_executor()->run(csr::make_spgemm(this, b_csr, x_csr)); + } else { + // otherwise we assume that b is dense and compute a SpMV/SpMM + this->get_executor()->run( + csr::make_spmv(this, as(b), as(x))); + } } @@ -87,8 +105,46 @@ void Csr::apply_impl(const LinOp *alpha, const LinOp *b, const LinOp *beta, LinOp *x) const { using Dense = Dense; - this->get_executor()->run(csr::make_advanced_spmv( - as(alpha), this, as(b), as(beta), as(x))); + using TCsr = Csr; + if (auto b_csr = dynamic_cast(b)) 
{ + // if b is a CSR matrix, we compute a SpGeMM + auto x_csr = as(x); + auto x_copy = x_csr->clone(); + this->get_executor()->run( + csr::make_advanced_spgemm(as(alpha), this, b_csr, + as(beta), x_copy.get(), x_csr)); + } else if (dynamic_cast *>(b)) { + // if b is an identity matrix, we compute an SpGEAM + auto x_csr = as(x); + auto x_copy = x_csr->clone(); + this->get_executor()->run(csr::make_spgeam( + as(alpha), this, as(beta), lend(x_copy), x_csr)); + } else { + // otherwise we assume that b is dense and compute a SpMV/SpMM + this->get_executor()->run( + csr::make_advanced_spmv(as(alpha), this, as(b), + as(beta), as(x))); + } +} + + +template +void Csr::convert_to( + Csr, IndexType> *result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->row_ptrs_ = this->row_ptrs_; + result->set_size(this->get_size()); + convert_strategy_helper(result); +} + + +template +void Csr::move_to( + Csr, IndexType> *result) +{ + this->convert_to(result); } @@ -101,7 +157,7 @@ void Csr::convert_to( exec, this->get_size(), this->get_num_stored_elements()); tmp->values_ = this->values_; tmp->col_idxs_ = this->col_idxs_; - exec->run(csr::make_convert_to_coo(tmp.get(), this)); + exec->run(csr::make_convert_to_coo(this, tmp.get())); tmp->move_to(result); } @@ -118,7 +174,7 @@ void Csr::convert_to(Dense *result) const { auto exec = this->get_executor(); auto tmp = Dense::create(exec, this->get_size()); - exec->run(csr::make_convert_to_dense(tmp.get(), this)); + exec->run(csr::make_convert_to_dense(this, tmp.get())); tmp->move_to(result); } @@ -148,7 +204,7 @@ void Csr::convert_to( auto tmp = Hybrid::create( exec, this->get_size(), max_nnz_per_row, stride, coo_nnz, result->get_strategy()); - exec->run(csr::make_convert_to_hybrid(tmp.get(), this)); + exec->run(csr::make_convert_to_hybrid(this, tmp.get())); tmp->move_to(result); } @@ -176,7 +232,7 @@ void Csr::convert_to( slice_size)); auto tmp = Sellp::create( exec, this->get_size(), slice_size, 
stride_factor, total_cols); - exec->run(csr::make_convert_to_sellp(tmp.get(), this)); + exec->run(csr::make_convert_to_sellp(this, tmp.get())); tmp->move_to(result); } @@ -223,7 +279,7 @@ void Csr::convert_to( exec->run(csr::make_calculate_max_nnz_per_row(this, &max_nnz_per_row)); auto tmp = Ell::create(exec, this->get_size(), max_nnz_per_row); - exec->run(csr::make_convert_to_ell(tmp.get(), this)); + exec->run(csr::make_convert_to_ell(this, tmp.get())); tmp->move_to(result); } @@ -300,7 +356,7 @@ std::unique_ptr Csr::transpose() const Csr::create(exec, gko::transpose(this->get_size()), this->get_num_stored_elements(), this->get_strategy()); - exec->run(csr::make_transpose(trans_cpy.get(), this)); + exec->run(csr::make_transpose(this, trans_cpy.get())); trans_cpy->make_srow(); return std::move(trans_cpy); } @@ -314,12 +370,82 @@ std::unique_ptr Csr::conj_transpose() const Csr::create(exec, gko::transpose(this->get_size()), this->get_num_stored_elements(), this->get_strategy()); - exec->run(csr::make_conj_transpose(trans_cpy.get(), this)); + exec->run(csr::make_conj_transpose(this, trans_cpy.get())); trans_cpy->make_srow(); return std::move(trans_cpy); } +template +std::unique_ptr Csr::row_permute( + const Array *permutation_indices) const +{ + GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[0]); + auto exec = this->get_executor(); + auto permute_cpy = + Csr::create(exec, this->get_size(), this->get_num_stored_elements(), + this->get_strategy()); + + exec->run( + csr::make_row_permute(permutation_indices, this, permute_cpy.get())); + permute_cpy->make_srow(); + return std::move(permute_cpy); +} + + +template +std::unique_ptr Csr::column_permute( + const Array *permutation_indices) const +{ + GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[1]); + auto exec = this->get_executor(); + auto permute_cpy = + Csr::create(exec, this->get_size(), this->get_num_stored_elements(), + this->get_strategy()); + + exec->run( + 
csr::make_column_permute(permutation_indices, this, permute_cpy.get())); + permute_cpy->make_srow(); + return std::move(permute_cpy); +} + + +template +std::unique_ptr Csr::inverse_row_permute( + const Array *inverse_permutation_indices) const +{ + GKO_ASSERT_EQ(inverse_permutation_indices->get_num_elems(), + this->get_size()[0]); + auto exec = this->get_executor(); + auto inverse_permute_cpy = + Csr::create(exec, this->get_size(), this->get_num_stored_elements(), + this->get_strategy()); + + exec->run(csr::make_inverse_row_permute(inverse_permutation_indices, this, + inverse_permute_cpy.get())); + inverse_permute_cpy->make_srow(); + return std::move(inverse_permute_cpy); +} + + +template +std::unique_ptr Csr::inverse_column_permute( + const Array *inverse_permutation_indices) const +{ + GKO_ASSERT_EQ(inverse_permutation_indices->get_num_elems(), + this->get_size()[1]); + auto exec = this->get_executor(); + auto inverse_permute_cpy = + Csr::create(exec, this->get_size(), this->get_num_stored_elements(), + this->get_strategy()); + + exec->run(csr::make_inverse_column_permute( + inverse_permutation_indices, this, inverse_permute_cpy.get())); + inverse_permute_cpy->make_srow(); + return std::move(inverse_permute_cpy); +} + + template void Csr::sort_by_column_index() { diff --git a/core/matrix/csr_builder.hpp b/core/matrix/csr_builder.hpp new file mode 100644 index 00000000000..73f892dc3a8 --- /dev/null +++ b/core/matrix/csr_builder.hpp @@ -0,0 +1,89 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_MATRIX_CSR_BUILDER_HPP_ +#define GKO_CORE_MATRIX_CSR_BUILDER_HPP_ + + +#include + + +namespace gko { +namespace matrix { + + +/** + * @internal + * + * Allows intrusive access to the arrays stored within a @ref Csr matrix. + * + * @tparam ValueType the value type of the matrix + * @tparam IndexType the index type of the matrix + */ +template +class CsrBuilder { +public: + /** + * Returns the column index array of the CSR matrix. + */ + Array &get_col_idx_array() { return matrix_->col_idxs_; } + + /** + * Returns the value array of the CSR matrix. + */ + Array &get_value_array() { return matrix_->values_; } + + /** + * Initializes a CsrBuilder from an existing CSR matrix. 
+ */ + explicit CsrBuilder(Csr *matrix) : matrix_{matrix} {} + + /** + * Updates the internal matrix data structures at destruction. + */ + ~CsrBuilder() { matrix_->make_srow(); } + + // make this type non-movable + CsrBuilder(const CsrBuilder &) = delete; + CsrBuilder(CsrBuilder &&) = delete; + CsrBuilder &operator=(const CsrBuilder &) = delete; + CsrBuilder &operator=(CsrBuilder &&) = delete; + +private: + Csr *matrix_; +}; + + +} // namespace matrix +} // namespace gko + +#endif // GKO_CORE_MATRIX_CSR_BUILDER_HPP_ diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp index 2e099594e70..f901c0f0952 100644 --- a/core/matrix/csr_kernels.hpp +++ b/core/matrix/csr_kernels.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,9 +34,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_CORE_MATRIX_CSR_KERNELS_HPP_ +#include + + +#include #include #include -#include #include #include #include @@ -61,30 +64,53 @@ namespace kernels { const matrix::Dense *beta, \ matrix::Dense *c) -#define GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \ - void convert_to_dense(std::shared_ptr exec, \ - matrix::Dense *result, \ - const matrix::Csr *source) - -#define GKO_DECLARE_CSR_CONVERT_TO_COO_KERNEL(ValueType, IndexType) \ - void convert_to_coo(std::shared_ptr exec, \ - matrix::Coo *result, \ - const matrix::Csr *source) - -#define GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL(ValueType, IndexType) \ - void convert_to_ell(std::shared_ptr exec, \ - matrix::Ell *result, \ - const matrix::Csr *source) - -#define GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL(ValueType, IndexType) \ - void convert_to_hybrid(std::shared_ptr exec, \ - matrix::Hybrid *result, \ - const matrix::Csr *source) - -#define GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL(ValueType, IndexType) \ - void convert_to_sellp(std::shared_ptr exec, \ - matrix::Sellp *result, \ - const matrix::Csr *source) +#define GKO_DECLARE_CSR_SPGEMM_KERNEL(ValueType, IndexType) \ + void spgemm(std::shared_ptr exec, \ + const matrix::Csr *a, \ + const matrix::Csr *b, \ + matrix::Csr *c) + +#define GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL(ValueType, IndexType) \ + void advanced_spgemm(std::shared_ptr exec, \ + const matrix::Dense *alpha, \ + const matrix::Csr *a, \ + const matrix::Csr *b, \ + const matrix::Dense *beta, \ + const matrix::Csr *d, \ + matrix::Csr *c) + +#define GKO_DECLARE_CSR_SPGEAM_KERNEL(ValueType, IndexType) \ + void spgeam(std::shared_ptr exec, \ + const matrix::Dense *alpha, \ + const matrix::Csr *a, \ + const matrix::Dense *beta, \ + const matrix::Csr *b, \ + matrix::Csr *c) + +#define GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \ + void convert_to_dense(std::shared_ptr exec, \ + const matrix::Csr *source, \ + matrix::Dense *result) + +#define 
GKO_DECLARE_CSR_CONVERT_TO_COO_KERNEL(ValueType, IndexType) \ + void convert_to_coo(std::shared_ptr exec, \ + const matrix::Csr *source, \ + matrix::Coo *result) + +#define GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL(ValueType, IndexType) \ + void convert_to_ell(std::shared_ptr exec, \ + const matrix::Csr *source, \ + matrix::Ell *result) + +#define GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL(ValueType, IndexType) \ + void convert_to_hybrid(std::shared_ptr exec, \ + const matrix::Csr *source, \ + matrix::Hybrid *result) + +#define GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL(ValueType, IndexType) \ + void convert_to_sellp(std::shared_ptr exec, \ + const matrix::Csr *source, \ + matrix::Sellp *result) #define GKO_DECLARE_CSR_CALCULATE_TOTAL_COLS_KERNEL(ValueType, IndexType) \ void calculate_total_cols(std::shared_ptr exec, \ @@ -92,15 +118,40 @@ namespace kernels { size_type *result, size_type stride_factor, \ size_type slice_size) -#define GKO_DECLARE_CSR_TRANSPOSE_KERNEL(ValueType, IndexType) \ - void transpose(std::shared_ptr exec, \ - matrix::Csr *trans, \ - const matrix::Csr *orig) - -#define GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType) \ - void conj_transpose(std::shared_ptr exec, \ - matrix::Csr *trans, \ - const matrix::Csr *orig) +#define GKO_DECLARE_CSR_TRANSPOSE_KERNEL(ValueType, IndexType) \ + void transpose(std::shared_ptr exec, \ + const matrix::Csr *orig, \ + matrix::Csr *trans) + +#define GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType) \ + void conj_transpose(std::shared_ptr exec, \ + const matrix::Csr *orig, \ + matrix::Csr *trans) + +#define GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL(ValueType, IndexType) \ + void row_permute(std::shared_ptr exec, \ + const Array *permutation_indices, \ + const matrix::Csr *orig, \ + matrix::Csr *row_permuted) + +#define GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL(ValueType, IndexType) \ + void column_permute(std::shared_ptr exec, \ + const Array *permutation_indices, \ + const matrix::Csr *orig, \ + matrix::Csr 
*column_permuted) + +#define GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType) \ + void inverse_row_permute(std::shared_ptr exec, \ + const Array *permutation_indices, \ + const matrix::Csr *orig, \ + matrix::Csr *row_permuted) + +#define GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType) \ + void inverse_column_permute( \ + std::shared_ptr exec, \ + const Array *permutation_indices, \ + const matrix::Csr *orig, \ + matrix::Csr *column_permuted) #define GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(ValueType, IndexType) \ void calculate_max_nnz_per_row( \ @@ -113,6 +164,7 @@ namespace kernels { std::shared_ptr exec, \ const matrix::Csr *source, \ Array *result) + #define GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX(ValueType, IndexType) \ void sort_by_column_index(std::shared_ptr exec, \ matrix::Csr *to_sort) @@ -128,6 +180,12 @@ namespace kernels { template \ GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL(ValueType, IndexType); \ template \ + GKO_DECLARE_CSR_SPGEMM_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_SPGEAM_KERNEL(ValueType, IndexType); \ + template \ GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType); \ template \ GKO_DECLARE_CSR_CONVERT_TO_COO_KERNEL(ValueType, IndexType); \ @@ -144,6 +202,14 @@ namespace kernels { template \ GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType); \ template \ + GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(ValueType, IndexType); \ template \ GKO_DECLARE_CSR_CALCULATE_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType); \ @@ -180,6 +246,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // 
namespace reference +namespace hip { +namespace csr { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace csr +} // namespace hip + + #undef GKO_DECLARE_ALL_AS_TEMPLATES diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index b284593d21a..d7ba31b77ad 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,6 +33,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include + + +#include #include #include #include @@ -49,9 +53,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/dense_kernels.hpp" -#include - - namespace gko { namespace matrix { namespace dense { @@ -71,6 +72,10 @@ GKO_REGISTER_OPERATION(calculate_nonzeros_per_row, GKO_REGISTER_OPERATION(calculate_total_cols, dense::calculate_total_cols); GKO_REGISTER_OPERATION(transpose, dense::transpose); GKO_REGISTER_OPERATION(conj_transpose, dense::conj_transpose); +GKO_REGISTER_OPERATION(row_permute, dense::row_permute); +GKO_REGISTER_OPERATION(column_permute, dense::column_permute); +GKO_REGISTER_OPERATION(inverse_row_permute, dense::inverse_row_permute); +GKO_REGISTER_OPERATION(inverse_column_permute, dense::inverse_column_permute); GKO_REGISTER_OPERATION(convert_to_coo, dense::convert_to_coo); GKO_REGISTER_OPERATION(convert_to_csr, dense::convert_to_csr); GKO_REGISTER_OPERATION(convert_to_ell, dense::convert_to_ell); @@ -96,7 +101,7 @@ inline void conversion_helper(Coo *result, exec->run(dense::make_count_nonzeros(source, &num_stored_nonzeros)); auto tmp = Coo::create(exec, source->get_size(), num_stored_nonzeros); - exec->run(op(tmp.get(), source)); + exec->run(op(source, tmp.get())); tmp->move_to(result); } @@ -108,21 +113,12 @@ inline void 
conversion_helper(Csr *result, { auto exec = source->get_executor(); - if (source->get_size()) { - size_type num_stored_nonzeros = 0; - exec->run(dense::make_count_nonzeros(source, &num_stored_nonzeros)); - auto tmp = Csr::create(exec, source->get_size(), - num_stored_nonzeros, - result->get_strategy()); - exec->run(op(tmp.get(), source)); - tmp->move_to(result); - } - // If source is empty, there is no need to copy data or to call kernels - else { - auto tmp = - Csr::create(exec, result->get_strategy()); - tmp->move_to(result); - } + size_type num_stored_nonzeros = 0; + exec->run(dense::make_count_nonzeros(source, &num_stored_nonzeros)); + auto tmp = Csr::create( + exec, source->get_size(), num_stored_nonzeros, result->get_strategy()); + exec->run(op(source, tmp.get())); + tmp->move_to(result); } @@ -140,7 +136,7 @@ inline void conversion_helper(Ell *result, const auto stride = std::max(result->get_stride(), source->get_size()[0]); auto tmp = Ell::create(exec, source->get_size(), max_nnz_per_row, stride); - exec->run(op(tmp.get(), source)); + exec->run(op(source, tmp.get())); tmp->move_to(result); } @@ -165,7 +161,7 @@ inline void conversion_helper(Hybrid *result, auto tmp = Hybrid::create( exec, source->get_size(), max_nnz_per_row, stride, coo_nnz, result->get_strategy()); - exec->run(op(tmp.get(), source)); + exec->run(op(source, tmp.get())); tmp->move_to(result); } @@ -187,7 +183,7 @@ inline void conversion_helper(Sellp *result, stride_factor, slice_size)); auto tmp = Sellp::create( exec, source->get_size(), slice_size, stride_factor, total_cols); - exec->run(op(tmp.get(), source)); + exec->run(op(source, tmp.get())); tmp->move_to(result); } @@ -203,7 +199,7 @@ inline void conversion_helper(SparsityCsr *result, exec->run(dense::make_count_nonzeros(source, &num_stored_nonzeros)); auto tmp = SparsityCsr::create( exec, source->get_size(), num_stored_nonzeros); - exec->run(op(tmp.get(), source)); + exec->run(op(source, tmp.get())); tmp->move_to(result); } @@ 
-271,10 +267,28 @@ void Dense::compute_dot_impl(const LinOp *b, LinOp *result) const template void Dense::compute_norm2_impl(LinOp *result) const { + using NormVector = Dense>; GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); auto exec = this->get_executor(); exec->run(dense::make_compute_norm2(as>(this), - as>(result))); + as(result))); +} + + +template +void Dense::convert_to( + Dense> *result) const +{ + result->values_ = this->values_; + result->stride_ = this->stride_; + result->set_size(this->get_size()); +} + + +template +void Dense::move_to(Dense> *result) +{ + this->convert_to(result); } @@ -283,8 +297,8 @@ void Dense::convert_to(Coo *result) const { conversion_helper( result, this, - dense::template make_convert_to_coo *&>); + dense::template make_convert_to_coo *&, + decltype(result)>); } @@ -300,8 +314,8 @@ void Dense::convert_to(Coo *result) const { conversion_helper( result, this, - dense::template make_convert_to_coo *&>); + dense::template make_convert_to_coo *&, + decltype(result)>); } @@ -317,8 +331,8 @@ void Dense::convert_to(Csr *result) const { conversion_helper( result, this, - dense::template make_convert_to_csr *&>); + dense::template make_convert_to_csr *&, + decltype(result)>); result->make_srow(); } @@ -335,8 +349,8 @@ void Dense::convert_to(Csr *result) const { conversion_helper( result, this, - dense::template make_convert_to_csr *&>); + dense::template make_convert_to_csr *&, + decltype(result)>); result->make_srow(); } @@ -353,8 +367,8 @@ void Dense::convert_to(Ell *result) const { conversion_helper( result, this, - dense::template make_convert_to_ell *&>); + dense::template make_convert_to_ell *&, + decltype(result)>); } @@ -370,8 +384,8 @@ void Dense::convert_to(Ell *result) const { conversion_helper( result, this, - dense::template make_convert_to_ell *&>); + dense::template make_convert_to_ell *&, + decltype(result)>); } @@ -387,8 +401,8 @@ void Dense::convert_to(Hybrid *result) const { conversion_helper( 
result, this, - dense::template make_convert_to_hybrid *&>); + dense::template make_convert_to_hybrid *&, + decltype(result)>); } @@ -404,8 +418,8 @@ void Dense::convert_to(Hybrid *result) const { conversion_helper( result, this, - dense::template make_convert_to_hybrid *&>); + dense::template make_convert_to_hybrid *&, + decltype(result)>); } @@ -421,8 +435,8 @@ void Dense::convert_to(Sellp *result) const { conversion_helper( result, this, - dense::template make_convert_to_sellp *&>); + dense::template make_convert_to_sellp *&, + decltype(result)>); } @@ -438,8 +452,8 @@ void Dense::convert_to(Sellp *result) const { conversion_helper( result, this, - dense::template make_convert_to_sellp *&>); + dense::template make_convert_to_sellp *&, + decltype(result)>); } @@ -453,9 +467,10 @@ void Dense::move_to(Sellp *result) template void Dense::convert_to(SparsityCsr *result) const { - conversion_helper(result, this, - dense::template make_convert_to_sparsity_csr< - decltype(result), const Dense *&>); + conversion_helper( + result, this, + dense::template make_convert_to_sparsity_csr *&, + decltype(result)>); } @@ -469,9 +484,10 @@ void Dense::move_to(SparsityCsr *result) template void Dense::convert_to(SparsityCsr *result) const { - conversion_helper(result, this, - dense::template make_convert_to_sparsity_csr< - decltype(result), const Dense *&>); + conversion_helper( + result, this, + dense::template make_convert_to_sparsity_csr *&, + decltype(result)>); } @@ -572,7 +588,7 @@ std::unique_ptr Dense::transpose() const auto exec = this->get_executor(); auto trans_cpy = Dense::create(exec, gko::transpose(this->get_size())); - exec->run(dense::make_transpose(trans_cpy.get(), this)); + exec->run(dense::make_transpose(this, trans_cpy.get())); return std::move(trans_cpy); } @@ -584,11 +600,135 @@ std::unique_ptr Dense::conj_transpose() const auto exec = this->get_executor(); auto trans_cpy = Dense::create(exec, gko::transpose(this->get_size())); - 
exec->run(dense::make_conj_transpose(trans_cpy.get(), this)); + exec->run(dense::make_conj_transpose(this, trans_cpy.get())); return std::move(trans_cpy); } +template +std::unique_ptr Dense::row_permute( + const Array *permutation_indices) const +{ + GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[0]); + auto exec = this->get_executor(); + auto permute_cpy = Dense::create(exec, this->get_size()); + + exec->run( + dense::make_row_permute(permutation_indices, this, permute_cpy.get())); + + return std::move(permute_cpy); +} + + +template +std::unique_ptr Dense::column_permute( + const Array *permutation_indices) const +{ + GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[1]); + auto exec = this->get_executor(); + auto permute_cpy = Dense::create(exec, this->get_size()); + + exec->run(dense::make_column_permute(permutation_indices, this, + permute_cpy.get())); + + return std::move(permute_cpy); +} + + +template +std::unique_ptr Dense::row_permute( + const Array *permutation_indices) const +{ + GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[0]); + auto exec = this->get_executor(); + auto permute_cpy = Dense::create(exec, this->get_size()); + + exec->run( + dense::make_row_permute(permutation_indices, this, permute_cpy.get())); + + return std::move(permute_cpy); +} + + +template +std::unique_ptr Dense::column_permute( + const Array *permutation_indices) const +{ + GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[1]); + auto exec = this->get_executor(); + auto permute_cpy = Dense::create(exec, this->get_size()); + + exec->run(dense::make_column_permute(permutation_indices, this, + permute_cpy.get())); + + return std::move(permute_cpy); +} + + +template +std::unique_ptr Dense::inverse_row_permute( + const Array *inverse_permutation_indices) const +{ + GKO_ASSERT_EQ(inverse_permutation_indices->get_num_elems(), + this->get_size()[0]); + auto exec = this->get_executor(); + auto 
inverse_permute_cpy = Dense::create(exec, this->get_size()); + + exec->run(dense::make_inverse_row_permute(inverse_permutation_indices, this, + inverse_permute_cpy.get())); + + return std::move(inverse_permute_cpy); +} + + +template +std::unique_ptr Dense::inverse_column_permute( + const Array *inverse_permutation_indices) const +{ + GKO_ASSERT_EQ(inverse_permutation_indices->get_num_elems(), + this->get_size()[1]); + auto exec = this->get_executor(); + auto inverse_permute_cpy = Dense::create(exec, this->get_size()); + + exec->run(dense::make_inverse_column_permute( + inverse_permutation_indices, this, inverse_permute_cpy.get())); + + return std::move(inverse_permute_cpy); +} + + +template +std::unique_ptr Dense::inverse_row_permute( + const Array *inverse_permutation_indices) const +{ + GKO_ASSERT_EQ(inverse_permutation_indices->get_num_elems(), + this->get_size()[0]); + auto exec = this->get_executor(); + auto inverse_permute_cpy = Dense::create(exec, this->get_size()); + + exec->run(dense::make_inverse_row_permute(inverse_permutation_indices, this, + inverse_permute_cpy.get())); + + return std::move(inverse_permute_cpy); +} + + +template +std::unique_ptr Dense::inverse_column_permute( + const Array *inverse_permutation_indices) const +{ + GKO_ASSERT_EQ(inverse_permutation_indices->get_num_elems(), + this->get_size()[1]); + auto exec = this->get_executor(); + auto inverse_permute_cpy = Dense::create(exec, this->get_size()); + + exec->run(dense::make_inverse_column_permute( + inverse_permutation_indices, this, inverse_permute_cpy.get())); + + return std::move(inverse_permute_cpy); +} + + #define GKO_DECLARE_DENSE_MATRIX(_type) class Dense<_type> GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_MATRIX); diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp index 4857fb81db9..6c362eeaeb4 100644 --- a/core/matrix/dense_kernels.hpp +++ b/core/matrix/dense_kernels.hpp @@ -1,5 +1,5 @@ 
/************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,10 +34,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_CORE_MATRIX_DENSE_KERNELS_HPP_ -#include #include +#include +#include + + namespace gko { namespace kernels { @@ -71,37 +74,37 @@ namespace kernels { #define GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL(_type) \ void compute_norm2(std::shared_ptr exec, \ const matrix::Dense<_type> *x, \ - matrix::Dense<_type> *result) + matrix::Dense> *result) #define GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL(_type, _prec) \ void convert_to_coo(std::shared_ptr exec, \ - matrix::Coo<_type, _prec> *other, \ - const matrix::Dense<_type> *source) + const matrix::Dense<_type> *source, \ + matrix::Coo<_type, _prec> *other) #define GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL(_type, _prec) \ void convert_to_csr(std::shared_ptr exec, \ - matrix::Csr<_type, _prec> *other, \ - const matrix::Dense<_type> *source) + const matrix::Dense<_type> *source, \ + matrix::Csr<_type, _prec> *other) #define GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL(_type, _prec) \ void convert_to_ell(std::shared_ptr exec, \ - matrix::Ell<_type, _prec> *other, \ - const matrix::Dense<_type> *source) + const matrix::Dense<_type> *source, \ + matrix::Ell<_type, _prec> *other) #define GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL(_type, _prec) \ void convert_to_hybrid(std::shared_ptr exec, \ - matrix::Hybrid<_type, _prec> *other, \ - const matrix::Dense<_type> *source) + const matrix::Dense<_type> *source, \ + matrix::Hybrid<_type, _prec> *other) #define GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL(_type, _prec) \ void convert_to_sellp(std::shared_ptr exec, \ - matrix::Sellp<_type, _prec> *other, \ - const matrix::Dense<_type> *source) + const matrix::Dense<_type> *source, \ + matrix::Sellp<_type, _prec> 
*other) #define GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL(_type, _prec) \ void convert_to_sparsity_csr(std::shared_ptr exec, \ - matrix::SparsityCsr<_type, _prec> *other, \ - const matrix::Dense<_type> *source) + const matrix::Dense<_type> *source, \ + matrix::SparsityCsr<_type, _prec> *other) #define GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL(_type) \ void count_nonzeros(std::shared_ptr exec, \ @@ -125,13 +128,37 @@ namespace kernels { #define GKO_DECLARE_TRANSPOSE_KERNEL(_type) \ void transpose(std::shared_ptr exec, \ - matrix::Dense<_type> *trans, \ - const matrix::Dense<_type> *orig) + const matrix::Dense<_type> *orig, \ + matrix::Dense<_type> *trans) #define GKO_DECLARE_CONJ_TRANSPOSE_KERNEL(_type) \ void conj_transpose(std::shared_ptr exec, \ - matrix::Dense<_type> *trans, \ - const matrix::Dense<_type> *orig) + const matrix::Dense<_type> *orig, \ + matrix::Dense<_type> *trans) + +#define GKO_DECLARE_ROW_PERMUTE_KERNEL(_vtype, _itype) \ + void row_permute(std::shared_ptr exec, \ + const Array<_itype> *permutation_indices, \ + const matrix::Dense<_vtype> *orig, \ + matrix::Dense<_vtype> *row_permuted) + +#define GKO_DECLARE_COLUMN_PERMUTE_KERNEL(_vtype, _itype) \ + void column_permute(std::shared_ptr exec, \ + const Array<_itype> *permutation_indices, \ + const matrix::Dense<_vtype> *orig, \ + matrix::Dense<_vtype> *column_permuted) + +#define GKO_DECLARE_INVERSE_ROW_PERMUTE_KERNEL(_vtype, _itype) \ + void inverse_row_permute(std::shared_ptr exec, \ + const Array<_itype> *permutation_indices, \ + const matrix::Dense<_vtype> *orig, \ + matrix::Dense<_vtype> *row_permuted) + +#define GKO_DECLARE_INVERSE_COLUMN_PERMUTE_KERNEL(_vtype, _itype) \ + void inverse_column_permute(std::shared_ptr exec, \ + const Array<_itype> *permutation_indices, \ + const matrix::Dense<_vtype> *orig, \ + matrix::Dense<_vtype> *column_permuted) #define GKO_DECLARE_ALL_AS_TEMPLATES \ template \ @@ -169,7 +196,15 @@ namespace kernels { template \ 
GKO_DECLARE_TRANSPOSE_KERNEL(ValueType); \ template \ - GKO_DECLARE_CONJ_TRANSPOSE_KERNEL(ValueType) + GKO_DECLARE_CONJ_TRANSPOSE_KERNEL(ValueType); \ + template \ + GKO_DECLARE_ROW_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType) namespace omp { @@ -199,6 +234,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace reference +namespace hip { +namespace dense { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace dense +} // namespace hip + + #undef GKO_DECLARE_ALL_AS_TEMPLATES diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp index 1d4bed57288..e12ad5ff83f 100644 --- a/core/matrix/ell.cpp +++ b/core/matrix/ell.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -108,12 +108,32 @@ void Ell::apply_impl(const LinOp *alpha, const LinOp *b, } +template +void Ell::convert_to( + Ell, IndexType> *result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->num_stored_elements_per_row_ = this->num_stored_elements_per_row_; + result->stride_ = this->stride_; + result->set_size(this->get_size()); +} + + +template +void Ell::move_to( + Ell, IndexType> *result) +{ + this->convert_to(result); +} + + template void Ell::convert_to(Dense *result) const { auto exec = this->get_executor(); auto tmp = Dense::create(exec, this->get_size()); - exec->run(ell::make_convert_to_dense(tmp.get(), this)); + exec->run(ell::make_convert_to_dense(this, tmp.get())); tmp->move_to(result); } @@ -136,7 +156,7 @@ void Ell::convert_to( auto tmp = Csr::create( exec, this->get_size(), num_stored_elements, result->get_strategy()); - exec->run(ell::make_convert_to_csr(tmp.get(), this)); + exec->run(ell::make_convert_to_csr(this, tmp.get())); tmp->make_srow(); tmp->move_to(result); diff --git a/core/matrix/ell_kernels.hpp b/core/matrix/ell_kernels.hpp index 42728a5ade5..42331c9e53b 100644 --- a/core/matrix/ell_kernels.hpp +++ b/core/matrix/ell_kernels.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,9 +34,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_CORE_MATRIX_ELL_KERNELS_HPP_ +#include + + #include #include -#include namespace gko { @@ -56,15 +58,15 @@ namespace kernels { const matrix::Dense *beta, \ matrix::Dense *c) -#define GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \ - void convert_to_dense(std::shared_ptr exec, \ - matrix::Dense *result, \ - const matrix::Ell *source) +#define GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \ + void convert_to_dense(std::shared_ptr exec, \ + const matrix::Ell *source, \ + matrix::Dense *result) -#define GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) \ - void convert_to_csr(std::shared_ptr exec, \ - matrix::Csr *result, \ - const matrix::Ell *source) +#define GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) \ + void convert_to_csr(std::shared_ptr exec, \ + const matrix::Ell *source, \ + matrix::Csr *result) #define GKO_DECLARE_ELL_COUNT_NONZEROS_KERNEL(ValueType, IndexType) \ void count_nonzeros(std::shared_ptr exec, \ @@ -120,6 +122,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace reference +namespace hip { +namespace ell { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace ell +} // namespace hip + + #undef GKO_DECLARE_ALL_AS_TEMPLATES diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp index 40a50a377ea..adbb48bd1aa 100644 --- a/core/matrix/hybrid.cpp +++ b/core/matrix/hybrid.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -109,12 +109,32 @@ void Hybrid::apply_impl(const LinOp *alpha, } +template +void Hybrid::convert_to( + Hybrid, IndexType> *result) const +{ + this->ell_->convert_to(result->ell_.get()); + this->coo_->convert_to(result->coo_.get()); + // TODO set strategy correctly + // There is no way to correctly clone the strategy like in Csr::convert_to + result->set_size(this->get_size()); +} + + +template +void Hybrid::move_to( + Hybrid, IndexType> *result) +{ + this->convert_to(result); +} + + template void Hybrid::convert_to(Dense *result) const { auto exec = this->get_executor(); auto tmp = Dense::create(exec, this->get_size()); - exec->run(hybrid::make_convert_to_dense(tmp.get(), this)); + exec->run(hybrid::make_convert_to_dense(this, tmp.get())); tmp->move_to(result); } @@ -137,7 +157,7 @@ void Hybrid::convert_to( auto tmp = Csr::create( exec, this->get_size(), num_stored_elements, result->get_strategy()); - exec->run(hybrid::make_convert_to_csr(tmp.get(), this)); + exec->run(hybrid::make_convert_to_csr(this, tmp.get())); tmp->make_srow(); tmp->move_to(result); diff --git a/core/matrix/hybrid_kernels.hpp b/core/matrix/hybrid_kernels.hpp index 84230986ada..788fe66e15b 100644 --- a/core/matrix/hybrid_kernels.hpp +++ b/core/matrix/hybrid_kernels.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,23 +34,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_CORE_MATRIX_HYBRID_KERNELS_HPP_ -#include #include +#include + + namespace gko { namespace kernels { -#define GKO_DECLARE_HYBRID_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \ - void convert_to_dense(std::shared_ptr exec, \ - matrix::Dense *result, \ - const matrix::Hybrid *source) +#define GKO_DECLARE_HYBRID_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \ + void convert_to_dense(std::shared_ptr exec, \ + const matrix::Hybrid *source, \ + matrix::Dense *result) -#define GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) \ - void convert_to_csr(std::shared_ptr exec, \ - matrix::Csr *result, \ - const matrix::Hybrid *source) +#define GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) \ + void convert_to_csr(std::shared_ptr exec, \ + const matrix::Hybrid *source, \ + matrix::Csr *result) #define GKO_DECLARE_HYBRID_COUNT_NONZEROS_KERNEL(ValueType, IndexType) \ void count_nonzeros(std::shared_ptr exec, \ @@ -93,6 +95,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace reference +namespace hip { +namespace hybrid { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace hybrid +} // namespace hip + + #undef GKO_DECLARE_ALL_AS_TEMPLATES diff --git a/core/matrix/identity.cpp b/core/matrix/identity.cpp index e5f0ef3a669..884e5781ee8 100644 --- a/core/matrix/identity.cpp +++ b/core/matrix/identity.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -69,6 +69,20 @@ std::unique_ptr IdentityFactory::generate_impl( } +template +std::unique_ptr Identity::transpose() const +{ + return this->clone(); +} + + +template +std::unique_ptr Identity::conj_transpose() const +{ + return this->clone(); +} + + #define GKO_DECLARE_IDENTITY_MATRIX(_type) class Identity<_type> GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDENTITY_MATRIX); #define GKO_DECLARE_IDENTITY_FACTORY(_type) class IdentityFactory<_type> diff --git a/core/matrix/permutation.cpp b/core/matrix/permutation.cpp new file mode 100644 index 00000000000..a8b6e5ff139 --- /dev/null +++ b/core/matrix/permutation.cpp @@ -0,0 +1,45 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +namespace gko { +namespace matrix { + + +#define GKO_DECLARE_PERMUTATION_MATRIX(_type) class Permutation<_type> +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PERMUTATION_MATRIX); + + +} // namespace matrix +} // namespace gko diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp index 880678b3738..5d282fbc495 100644 --- a/core/matrix/sellp.cpp +++ b/core/matrix/sellp.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -41,12 +41,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/base/allocator.hpp" #include "core/matrix/sellp_kernels.hpp" -#include - - namespace gko { namespace matrix { namespace sellp { @@ -69,7 +67,7 @@ template size_type calculate_total_cols(const matrix_data &data, const size_type slice_size, const size_type stride_factor, - std::vector &slice_lengths) + vector &slice_lengths) { size_type nonzeros_per_row = 0; IndexType current_row = 0; @@ -122,12 +120,35 @@ void Sellp::apply_impl(const LinOp *alpha, const LinOp *b, } +template +void Sellp::convert_to( + Sellp, IndexType> *result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->slice_lengths_ = this->slice_lengths_; + result->slice_sets_ = this->slice_sets_; + result->slice_size_ = this->slice_size_; + result->stride_factor_ = this->stride_factor_; + result->total_cols_ = this->total_cols_; + result->set_size(this->get_size()); +} + + +template +void Sellp::move_to( + Sellp, IndexType> *result) +{ + this->convert_to(result); +} + + template void Sellp::convert_to(Dense *result) const { auto exec = this->get_executor(); auto tmp = Dense::create(exec, this->get_size()); - exec->run(sellp::make_convert_to_dense(tmp.get(), this)); + exec->run(sellp::make_convert_to_dense(this, tmp.get())); tmp->move_to(result); } @@ -149,7 +170,7 @@ void Sellp::convert_to( exec->run(sellp::make_count_nonzeros(this, &num_stored_nonzeros)); auto tmp = Csr::create( exec, this->get_size(), num_stored_nonzeros, result->get_strategy()); - exec->run(sellp::make_convert_to_csr(tmp.get(), this)); + exec->run(sellp::make_convert_to_csr(this, tmp.get())); tmp->make_srow(); tmp->move_to(result); } @@ -175,7 +196,7 @@ void Sellp::read(const mat_data &data) // Allocate space for slice_cols. size_type slice_num = static_cast((data.size[0] + slice_size - 1) / slice_size); - std::vector slice_lengths(slice_num, 0); + vector slice_lengths(slice_num, 0, {this->get_executor()}); // Get the number of maximum columns for every slice. 
auto total_cols = diff --git a/core/matrix/sellp_kernels.hpp b/core/matrix/sellp_kernels.hpp index 0c45b6d7b82..fcd0114e25b 100644 --- a/core/matrix/sellp_kernels.hpp +++ b/core/matrix/sellp_kernels.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,9 +34,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_CORE_MATRIX_SELLP_KERNELS_HPP_ +#include + + #include #include -#include namespace gko { @@ -56,15 +58,15 @@ namespace kernels { const matrix::Dense *beta, \ matrix::Dense *c) -#define GKO_DECLARE_SELLP_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \ - void convert_to_dense(std::shared_ptr exec, \ - matrix::Dense *result, \ - const matrix::Sellp *source) +#define GKO_DECLARE_SELLP_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \ + void convert_to_dense(std::shared_ptr exec, \ + const matrix::Sellp *source, \ + matrix::Dense *result) -#define GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) \ - void convert_to_csr(std::shared_ptr exec, \ - matrix::Csr *result, \ - const matrix::Sellp *source) +#define GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) \ + void convert_to_csr(std::shared_ptr exec, \ + const matrix::Sellp *source, \ + matrix::Csr *result) #define GKO_DECLARE_SELLP_COUNT_NONZEROS_KERNEL(ValueType, IndexType) \ void count_nonzeros(std::shared_ptr exec, \ @@ -111,6 +113,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace reference +namespace hip { +namespace sellp { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace sellp +} // namespace hip + + #undef GKO_DECLARE_ALL_AS_TEMPLATES diff --git a/core/matrix/sparsity_csr.cpp b/core/matrix/sparsity_csr.cpp index 3c7cfa1363f..851dcd946a5 100644 --- a/core/matrix/sparsity_csr.cpp +++ b/core/matrix/sparsity_csr.cpp @@ -1,5 +1,5 @@ 
/************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,9 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include - - #include #include #include @@ -152,7 +149,7 @@ std::unique_ptr SparsityCsr::transpose() const auto trans_cpy = SparsityCsr::create(exec, gko::transpose(this->get_size()), this->get_num_nonzeros()); - exec->run(sparsity_csr::make_transpose(trans_cpy.get(), this)); + exec->run(sparsity_csr::make_transpose(this, trans_cpy.get())); return std::move(trans_cpy); } @@ -177,7 +174,7 @@ SparsityCsr::to_adjacency_matrix() const this->get_num_nonzeros() - num_diagonal_elements); exec->run(sparsity_csr::make_remove_diagonal_elements( - adj_mat.get(), this->get_const_row_ptrs(), this->get_const_col_idxs())); + this->get_const_row_ptrs(), this->get_const_col_idxs(), adj_mat.get())); return std::move(adj_mat); } diff --git a/core/matrix/sparsity_csr_kernels.hpp b/core/matrix/sparsity_csr_kernels.hpp index f9af3dcdffa..58ec58e789f 100644 --- a/core/matrix/sparsity_csr_kernels.hpp +++ b/core/matrix/sparsity_csr_kernels.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,9 +34,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_CORE_MATRIX_SPARSITY_CSR_KERNELS_HPP_ +#include + + #include #include -#include namespace gko { @@ -60,8 +62,8 @@ namespace kernels { IndexType) \ void remove_diagonal_elements( \ std::shared_ptr exec, \ - matrix::SparsityCsr *matrix, \ - const IndexType *row_ptrs, const IndexType *col_idxs) + const IndexType *row_ptrs, const IndexType *col_idxs, \ + matrix::SparsityCsr *matrix) #define GKO_DECLARE_SPARSITY_CSR_COUNT_NUM_DIAGONAL_ELEMENTS_KERNEL(ValueType, \ IndexType) \ @@ -70,10 +72,10 @@ namespace kernels { const matrix::SparsityCsr *matrix, \ size_type *num_diagonal_elements) -#define GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL(ValueType, IndexType) \ - void transpose(std::shared_ptr exec, \ - matrix::SparsityCsr *trans, \ - const matrix::SparsityCsr *orig) +#define GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL(ValueType, IndexType) \ + void transpose(std::shared_ptr exec, \ + const matrix::SparsityCsr *orig, \ + matrix::SparsityCsr *trans) #define GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX(ValueType, IndexType) \ void sort_by_column_index( \ @@ -133,6 +135,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace reference +namespace hip { +namespace sparsity_csr { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace sparsity_csr +} // namespace hip + + #undef GKO_DECLARE_ALL_AS_TEMPLATES diff --git a/core/preconditioner/isai.cpp b/core/preconditioner/isai.cpp new file mode 100644 index 00000000000..0b8738c5594 --- /dev/null +++ b/core/preconditioner/isai.cpp @@ -0,0 +1,246 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include +#include +#include + + +#include +#include +#include +#include +#include +#include + + +#include "core/preconditioner/isai_kernels.hpp" + + +namespace gko { +namespace preconditioner { +namespace isai { + + +GKO_REGISTER_OPERATION(generate_tri_inverse, isai::generate_tri_inverse); +GKO_REGISTER_OPERATION(generate_excess_system, isai::generate_excess_system); +GKO_REGISTER_OPERATION(scatter_excess_solution, isai::scatter_excess_solution); + + +} // namespace isai + + +/** + * @internal + * + * Helper function that converts the given matrix to the (const) CSR format with + * additional sorting. 
+ * + * If the given matrix was already sorted, is on the same executor and with a + * dynamic type of `const Csr`, the same pointer is returned with an empty + * deleter. + * In all other cases, a new matrix is created, which stores the converted Csr + * matrix. + * If `skip_sorting` is false, the matrix will be sorted by column index, + * otherwise, it will not be sorted. + */ +template +std::shared_ptr convert_to_csr_and_sort( + std::shared_ptr &exec, std::shared_ptr mtx, + bool skip_sorting) +{ + static_assert( + std::is_same>::value, + "The given `Csr` type must be of type `matrix::Csr`!"); + if (skip_sorting && exec == mtx->get_executor()) { + auto csr_mtx = std::dynamic_pointer_cast(mtx); + if (csr_mtx) { + // Here, we can just forward the pointer with an empty deleter + // since it is already sorted and in the correct format + return csr_mtx; + } + } + auto copy = Csr::create(exec); + as>(mtx)->convert_to(lend(copy)); + // Here, we assume that a sorted matrix converted to CSR will also be + // sorted + if (!skip_sorting) { + copy->sort_by_column_index(); + } + return {std::move(copy)}; +} + + +/** + * @internal + * + * Helper function that extends the sparsity pattern of the matrix M to M^n + * without changing its values. + * + * The input matrix must be sorted and on the correct executor for this to work. + * If `power` is 1, the matrix will be returned unchanged. 
+ */ +template +std::shared_ptr extend_sparsity(std::shared_ptr &exec, + std::shared_ptr mtx, int power) +{ + GKO_ASSERT_EQ(power >= 1, true); + if (power == 1) { + // copy the matrix, as it will be used to store the inverse + return {std::move(mtx->clone())}; + } + auto id_power = mtx->clone(); + auto tmp = Csr::create(exec, mtx->get_size()); + // accumulates mtx * the remainder from odd powers + auto acc = mtx->clone(); + // compute id^(n-1) using square-and-multiply + int i = power - 1; + while (i > 1) { + if (i % 2 != 0) { + // store one power in acc: + // i^(2n+1) -> i*i^2n + id_power->apply(lend(acc), lend(tmp)); + std::swap(acc, tmp); + i--; + } + // square id_power: i^2n -> (i^2)^n + id_power->apply(lend(id_power), lend(tmp)); + std::swap(id_power, tmp); + i /= 2; + } + // combine acc and id_power again + id_power->apply(lend(acc), lend(tmp)); + return {std::move(tmp)}; +} + + +template +void Isai::generate_inverse( + std::shared_ptr input, bool skip_sorting, int power) +{ + using Dense = matrix::Dense; + using LowerTrs = solver::LowerTrs; + using UpperTrs = solver::UpperTrs; + GKO_ASSERT_IS_SQUARE_MATRIX(input); + auto exec = this->get_executor(); + auto to_invert = convert_to_csr_and_sort(exec, input, skip_sorting); + auto inverted = extend_sparsity(exec, to_invert, power); + auto num_rows = inverted->get_size()[0]; + auto is_lower = IsaiType == isai_type::lower; + + // This stores the beginning of the RHS for the sparse block associated with + // each row of inverted_l + Array excess_block_ptrs{exec, num_rows + 1}; + // This stores the beginning of the non-zeros belonging to each row in the + // system of excess blocks + Array excess_row_ptrs_full{exec, num_rows + 1}; + + exec->run(isai::make_generate_tri_inverse( + lend(to_invert), lend(inverted), excess_block_ptrs.get_data(), + excess_row_ptrs_full.get_data(), is_lower)); + + auto excess_dim = + exec->copy_val_to_host(excess_block_ptrs.get_const_data() + num_rows); + // if we had long rows: + if 
(excess_dim > 0) { + // build the excess sparse triangular system + auto excess_nnz = exec->copy_val_to_host( + excess_row_ptrs_full.get_const_data() + num_rows); + auto excess_system = + Csr::create(exec, dim<2>(excess_dim, excess_dim), excess_nnz); + auto excess_rhs = Dense::create(exec, dim<2>(excess_dim, 1)); + auto excess_solution = Dense::create(exec, dim<2>(excess_dim, 1)); + exec->run(isai::make_generate_excess_system( + lend(to_invert), lend(inverted), excess_block_ptrs.get_const_data(), + excess_row_ptrs_full.get_const_data(), lend(excess_system), + lend(excess_rhs))); + // solve it after transposing + std::unique_ptr trs_factory; + if (is_lower) { + trs_factory = UpperTrs::build().on(exec); + } else { + trs_factory = LowerTrs::build().on(exec); + } + trs_factory->generate(share(excess_system->transpose())) + ->apply(lend(excess_rhs), lend(excess_solution)); + // and copy the results back to the original ISAI + exec->run(isai::make_scatter_excess_solution( + excess_block_ptrs.get_const_data(), lend(excess_solution), + lend(inverted))); + } + + approximate_inverse_ = std::move(inverted); +} + + +template +std::unique_ptr Isai::transpose() const +{ + std::unique_ptr transp{ + new transposed_type{this->get_executor()}}; + transp->set_size(gko::transpose(this->get_size())); + transp->approximate_inverse_ = + share(as(this->get_approximate_inverse()->transpose())); + + return std::move(transp); +} + + +template +std::unique_ptr Isai::conj_transpose() + const +{ + std::unique_ptr transp{ + new transposed_type{this->get_executor()}}; + transp->set_size(gko::transpose(this->get_size())); + transp->approximate_inverse_ = + share(as(this->get_approximate_inverse()->conj_transpose())); + + return std::move(transp); +} + + +#define GKO_DECLARE_LOWER_ISAI(ValueType, IndexType) \ + class Isai +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWER_ISAI); + +#define GKO_DECLARE_UPPER_ISAI(ValueType, IndexType) \ + class Isai 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_UPPER_ISAI); + + +} // namespace preconditioner +} // namespace gko diff --git a/core/preconditioner/isai_kernels.hpp b/core/preconditioner/isai_kernels.hpp new file mode 100644 index 00000000000..ce53d51cd3c --- /dev/null +++ b/core/preconditioner/isai_kernels.hpp @@ -0,0 +1,121 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_CORE_PRECONDITIONER_ISAI_KERNELS_HPP_ +#define GKO_CORE_PRECONDITIONER_ISAI_KERNELS_HPP_ + + +#include +#include + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL(ValueType, IndexType) \ + void generate_tri_inverse(std::shared_ptr exec, \ + const matrix::Csr *input, \ + matrix::Csr *inverse, \ + IndexType *excess_rhs_ptrs, \ + IndexType *excess_nz_ptrs, bool lower) + +#define GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL(ValueType, IndexType) \ + void generate_excess_system( \ + std::shared_ptr exec, \ + const matrix::Csr *input, \ + const matrix::Csr *inverse, \ + const IndexType *excess_rhs_ptrs, const IndexType *excess_nz_ptrs, \ + matrix::Csr *excess_system, \ + matrix::Dense *excess_rhs) + +#define GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL(ValueType, IndexType) \ + void scatter_excess_solution( \ + std::shared_ptr exec, \ + const IndexType *excess_rhs_ptrs, \ + const matrix::Dense *excess_solution, \ + matrix::Csr *inverse) + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + constexpr auto row_size_limit = 32; \ + template \ + GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL(ValueType, IndexType) + + +namespace omp { +namespace isai { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace isai +} // namespace omp + + +namespace cuda { +namespace isai { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace isai +} // namespace cuda + + +namespace reference { +namespace isai { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace isai +} // namespace reference + + +namespace hip { +namespace isai { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace isai +} // namespace hip + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // 
GKO_CORE_PRECONDITIONER_ISAI_KERNELS_HPP_ diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp index 954ad6e480d..f7351cd779c 100644 --- a/core/preconditioner/jacobi.cpp +++ b/core/preconditioner/jacobi.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -55,6 +55,8 @@ GKO_REGISTER_OPERATION(simple_apply, jacobi::simple_apply); GKO_REGISTER_OPERATION(apply, jacobi::apply); GKO_REGISTER_OPERATION(find_blocks, jacobi::find_blocks); GKO_REGISTER_OPERATION(generate, jacobi::generate); +GKO_REGISTER_OPERATION(transpose_jacobi, jacobi::transpose_jacobi); +GKO_REGISTER_OPERATION(conj_transpose_jacobi, jacobi::conj_transpose_jacobi); GKO_REGISTER_OPERATION(convert_to_dense, jacobi::convert_to_dense); GKO_REGISTER_OPERATION(initialize_precisions, jacobi::initialize_precisions); @@ -142,6 +144,48 @@ void Jacobi::write(mat_data &data) const } +template +std::unique_ptr Jacobi::transpose() const +{ + auto res = std::unique_ptr>( + new Jacobi(this->get_executor())); + // Jacobi enforces square matrices, so no dim transposition necessary + res->set_size(this->get_size()); + res->storage_scheme_ = storage_scheme_; + res->num_blocks_ = num_blocks_; + res->blocks_.resize_and_reset(blocks_.get_num_elems()); + res->conditioning_ = conditioning_; + res->parameters_ = parameters_; + this->get_executor()->run(jacobi::make_transpose_jacobi( + num_blocks_, parameters_.max_block_size, + parameters_.storage_optimization.block_wise, parameters_.block_pointers, + blocks_, storage_scheme_, res->blocks_)); + + return std::move(res); +} + + +template +std::unique_ptr Jacobi::conj_transpose() const +{ + auto res = std::unique_ptr>( + new Jacobi(this->get_executor())); + // Jacobi enforces square matrices, so no dim transposition necessary + 
res->set_size(this->get_size()); + res->storage_scheme_ = storage_scheme_; + res->num_blocks_ = num_blocks_; + res->blocks_.resize_and_reset(blocks_.get_num_elems()); + res->conditioning_ = conditioning_; + res->parameters_ = parameters_; + this->get_executor()->run(jacobi::make_conj_transpose_jacobi( + num_blocks_, parameters_.max_block_size, + parameters_.storage_optimization.block_wise, parameters_.block_pointers, + blocks_, storage_scheme_, res->blocks_)); + + return std::move(res); +} + + template void Jacobi::detect_blocks( const matrix::Csr *system_matrix) @@ -159,8 +203,7 @@ void Jacobi::detect_blocks( template void Jacobi::generate(const LinOp *system_matrix) { - GKO_ASSERT_EQUAL_DIMENSIONS(system_matrix, - transpose(system_matrix->get_size())); + GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix); const auto exec = this->get_executor(); const auto csr_mtx = copy_and_convert_to>( exec, system_matrix); diff --git a/core/preconditioner/jacobi_kernels.hpp b/core/preconditioner/jacobi_kernels.hpp index 9c839cf556d..12d232c26f8 100644 --- a/core/preconditioner/jacobi_kernels.hpp +++ b/core/preconditioner/jacobi_kernels.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,10 +34,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_CORE_PRECONDITIONER_JACOBI_KERNELS_HPP_ -#include #include +#include + + namespace gko { namespace kernels { @@ -83,6 +85,28 @@ namespace kernels { const Array &blocks, const matrix::Dense *b, \ matrix::Dense *x) +#define GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL(ValueType, IndexType) \ + void transpose_jacobi( \ + std::shared_ptr exec, size_type num_blocks, \ + uint32 max_block_size, \ + const Array &block_precisions, \ + const Array &block_pointers, \ + const Array &blocks, \ + const preconditioner::block_interleaved_storage_scheme \ + &storage_scheme, \ + Array &out_blocks) + +#define GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType) \ + void conj_transpose_jacobi( \ + std::shared_ptr exec, size_type num_blocks, \ + uint32 max_block_size, \ + const Array &block_precisions, \ + const Array &block_pointers, \ + const Array &blocks, \ + const preconditioner::block_interleaved_storage_scheme \ + &storage_scheme, \ + Array &out_blocks) + #define GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \ void convert_to_dense( \ std::shared_ptr exec, size_type num_blocks, \ @@ -108,6 +132,10 @@ namespace kernels { template \ GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL(ValueType, IndexType); \ template \ + GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType); \ + template \ GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType); \ GKO_DECLARE_JACOBI_INITIALIZE_PRECISIONS_KERNEL() @@ -139,6 +167,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace reference +namespace hip { +namespace jacobi { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace jacobi +} // namespace hip + + #undef GKO_DECLARE_ALL_AS_TEMPLATES diff --git a/core/preconditioner/jacobi_utils.hpp b/core/preconditioner/jacobi_utils.hpp index 7d46d7187ea..904820cbce2 100644 --- a/core/preconditioner/jacobi_utils.hpp +++ b/core/preconditioner/jacobi_utils.hpp @@ -1,5 +1,5 @@ 
/************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/core/solver/bicg.cpp b/core/solver/bicg.cpp new file mode 100644 index 00000000000..a7519f48a33 --- /dev/null +++ b/core/solver/bicg.cpp @@ -0,0 +1,240 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#include +#include +#include +#include +#include + + +#include "core/solver/bicg_kernels.hpp" + + +namespace gko { +namespace solver { + + +namespace bicg { + + +GKO_REGISTER_OPERATION(initialize, bicg::initialize); +GKO_REGISTER_OPERATION(step_1, bicg::step_1); +GKO_REGISTER_OPERATION(step_2, bicg::step_2); + + +} // namespace bicg + + +template +std::unique_ptr Bicg::transpose() const +{ + return build() + .with_generated_preconditioner( + share(as(this->get_preconditioner())->transpose())) + .with_criteria(this->stop_criterion_factory_) + .on(this->get_executor()) + ->generate( + share(as(this->get_system_matrix())->transpose())); +} + + +template +std::unique_ptr Bicg::conj_transpose() const +{ + return build() + .with_generated_preconditioner(share( + as(this->get_preconditioner())->conj_transpose())) + .with_criteria(this->stop_criterion_factory_) + .on(this->get_executor()) + ->generate(share( + as(this->get_system_matrix())->conj_transpose())); +} + + +/** + * @internal + * Transposes the matrix by converting it into a CSR matrix of type + * CsrType, followed by transposing. 
+ * + * @param mtx Matrix to transpose + * @tparam CsrType Matrix format in which the matrix mtx is converted into + * before transposing it + */ +template +std::unique_ptr transpose_with_csr(const LinOp *mtx) +{ + auto csr_matrix_unique_ptr = copy_and_convert_to( + mtx->get_executor(), const_cast(mtx)); + + csr_matrix_unique_ptr->set_strategy( + std::make_shared()); + + return csr_matrix_unique_ptr->transpose(); +} + + +template +void Bicg::apply_impl(const LinOp *b, LinOp *x) const +{ + using std::swap; + using Vector = matrix::Dense; + constexpr uint8 RelativeStoppingId{1}; + + auto exec = this->get_executor(); + + auto one_op = initialize({one()}, exec); + auto neg_one_op = initialize({-one()}, exec); + + auto dense_b = as(b); + auto dense_x = as(x); + auto r = Vector::create_with_config_of(dense_b); + auto r2 = Vector::create_with_config_of(dense_b); + auto z = Vector::create_with_config_of(dense_b); + auto z2 = Vector::create_with_config_of(dense_b); + auto p = Vector::create_with_config_of(dense_b); + auto p2 = Vector::create_with_config_of(dense_b); + auto q = Vector::create_with_config_of(dense_b); + auto q2 = Vector::create_with_config_of(dense_b); + + auto alpha = Vector::create(exec, dim<2>{1, dense_b->get_size()[1]}); + auto beta = Vector::create_with_config_of(alpha.get()); + auto prev_rho = Vector::create_with_config_of(alpha.get()); + auto rho = Vector::create_with_config_of(alpha.get()); + + bool one_changed{}; + Array stop_status(alpha->get_executor(), + dense_b->get_size()[1]); + + // TODO: replace this with automatic merged kernel generator + exec->run(bicg::make_initialize( + dense_b, r.get(), z.get(), p.get(), q.get(), prev_rho.get(), rho.get(), + r2.get(), z2.get(), p2.get(), q2.get(), &stop_status)); + // rho = 0.0 + // prev_rho = 1.0 + // z = p = q = 0 + // r = r2 = dense_b + // z2 = p2 = q2 = 0 + + std::unique_ptr trans_A; + auto transposable_system_matrix = + dynamic_cast(system_matrix_.get()); + + if (transposable_system_matrix) { + 
trans_A = transposable_system_matrix->transpose(); + } else { + // TODO Extend when adding more IndexTypes + // Try to figure out the IndexType that can be used for the CSR matrix + using Csr32 = matrix::Csr; + using Csr64 = matrix::Csr; + auto supports_int64 = + dynamic_cast *>(system_matrix_.get()); + if (supports_int64) { + trans_A = transpose_with_csr(system_matrix_.get()); + } else { + trans_A = transpose_with_csr(system_matrix_.get()); + } + } + + auto trans_preconditioner_tmp = + as(get_preconditioner().get()); + auto trans_preconditioner = trans_preconditioner_tmp->transpose(); + + system_matrix_->apply(neg_one_op.get(), dense_x, one_op.get(), r.get()); + // r = r - Ax = -1.0 * A*dense_x + 1.0*r + r2->copy_from(r.get()); + // r2 = r + auto stop_criterion = stop_criterion_factory_->generate( + system_matrix_, std::shared_ptr(b, [](const LinOp *) {}), + x, r.get()); + + int iter = -1; + + while (true) { + get_preconditioner()->apply(r.get(), z.get()); + trans_preconditioner->apply(r2.get(), z2.get()); + z->compute_dot(r2.get(), rho.get()); + + ++iter; + this->template log(this, iter, r.get(), + dense_x); + if (stop_criterion->update() + .num_iterations(iter) + .residual(r.get()) + .solution(dense_x) + .check(RelativeStoppingId, true, &stop_status, &one_changed)) { + break; + } + + exec->run(bicg::make_step_1(p.get(), z.get(), p2.get(), z2.get(), + rho.get(), prev_rho.get(), &stop_status)); + // tmp = rho / prev_rho + // p = z + tmp * p + // p2 = z2 + tmp * p2 + system_matrix_->apply(p.get(), q.get()); + trans_A->apply(p2.get(), q2.get()); + p2->compute_dot(q.get(), beta.get()); + exec->run(bicg::make_step_2(dense_x, r.get(), r2.get(), p.get(), + q.get(), q2.get(), beta.get(), rho.get(), + &stop_status)); + // tmp = rho / beta + // x = x + tmp * p + // r = r - tmp * q + // r2 = r2 - tmp * q2 + swap(prev_rho, rho); + } +} + + +template +void Bicg::apply_impl(const LinOp *alpha, const LinOp *b, + const LinOp *beta, LinOp *x) const +{ + auto dense_x = as>(x); + + 
auto x_clone = dense_x->clone(); + this->apply(b, x_clone.get()); + dense_x->scale(beta); + dense_x->add_scaled(alpha, x_clone.get()); +} + + +#define GKO_DECLARE_BICG(_type) class Bicg<_type> +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG); + + +} // namespace solver +} // namespace gko diff --git a/core/solver/bicg_kernels.hpp b/core/solver/bicg_kernels.hpp new file mode 100644 index 00000000000..9ef21b3a243 --- /dev/null +++ b/core/solver/bicg_kernels.hpp @@ -0,0 +1,134 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_SOLVER_BICG_KERNELS_HPP_ +#define GKO_CORE_SOLVER_BICG_KERNELS_HPP_ + + +#include +#include +#include +#include +#include + + +namespace gko { +namespace kernels { +namespace bicg { + + +#define GKO_DECLARE_BICG_INITIALIZE_KERNEL(_type) \ + void initialize(std::shared_ptr exec, \ + const matrix::Dense<_type> *b, matrix::Dense<_type> *r, \ + matrix::Dense<_type> *z, matrix::Dense<_type> *p, \ + matrix::Dense<_type> *q, matrix::Dense<_type> *prev_rho, \ + matrix::Dense<_type> *rho, matrix::Dense<_type> *r2, \ + matrix::Dense<_type> *z2, matrix::Dense<_type> *p2, \ + matrix::Dense<_type> *q2, \ + Array *stop_status) + + +#define GKO_DECLARE_BICG_STEP_1_KERNEL(_type) \ + void step_1(std::shared_ptr exec, \ + matrix::Dense<_type> *p, const matrix::Dense<_type> *z, \ + matrix::Dense<_type> *p2, const matrix::Dense<_type> *z2, \ + const matrix::Dense<_type> *rho, \ + const matrix::Dense<_type> *prev_rho, \ + const Array *stop_status) + + +#define GKO_DECLARE_BICG_STEP_2_KERNEL(_type) \ + void step_2(std::shared_ptr exec, \ + matrix::Dense<_type> *x, matrix::Dense<_type> *r, \ + matrix::Dense<_type> *r2, const matrix::Dense<_type> *p, \ + const matrix::Dense<_type> *q, const matrix::Dense<_type> *q2, \ + const matrix::Dense<_type> *beta, \ + const matrix::Dense<_type> *rho, \ + const Array *stop_status) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + 
GKO_DECLARE_BICG_INITIALIZE_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BICG_STEP_1_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BICG_STEP_2_KERNEL(ValueType) + + +} // namespace bicg + + +namespace omp { +namespace bicg { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace bicg +} // namespace omp + + +namespace cuda { +namespace bicg { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace bicg +} // namespace cuda + + +namespace reference { +namespace bicg { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace bicg +} // namespace reference + + +namespace hip { +namespace bicg { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace bicg +} // namespace hip + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_SOLVER_BICG_KERNELS_HPP_ diff --git a/core/solver/bicgstab.cpp b/core/solver/bicgstab.cpp index 3d1b55346af..570c9daee6f 100644 --- a/core/solver/bicgstab.cpp +++ b/core/solver/bicgstab.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -58,6 +58,32 @@ GKO_REGISTER_OPERATION(finalize, bicgstab::finalize); } // namespace bicgstab +template +std::unique_ptr Bicgstab::transpose() const +{ + return build() + .with_generated_preconditioner( + share(as(this->get_preconditioner())->transpose())) + .with_criteria(this->stop_criterion_factory_) + .on(this->get_executor()) + ->generate( + share(as(this->get_system_matrix())->transpose())); +} + + +template +std::unique_ptr Bicgstab::conj_transpose() const +{ + return build() + .with_generated_preconditioner(share( + as(this->get_preconditioner())->conj_transpose())) + .with_criteria(this->stop_criterion_factory_) + .on(this->get_executor()) + ->generate(share( + as(this->get_system_matrix())->conj_transpose())); +} + + template void Bicgstab::apply_impl(const LinOp *b, LinOp *x) const { diff --git a/core/solver/bicgstab_kernels.hpp b/core/solver/bicgstab_kernels.hpp index 0080cdb73ef..8b48151a50f 100644 --- a/core/solver/bicgstab_kernels.hpp +++ b/core/solver/bicgstab_kernels.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -139,6 +139,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace reference +namespace hip { +namespace bicgstab { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace bicgstab +} // namespace hip + + #undef GKO_DECLARE_ALL_AS_TEMPLATES diff --git a/core/solver/cg.cpp b/core/solver/cg.cpp index aa1a59111b1..838ede4a882 100644 --- a/core/solver/cg.cpp +++ b/core/solver/cg.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -59,6 +59,32 @@ GKO_REGISTER_OPERATION(step_2, cg::step_2); } // namespace cg +template +std::unique_ptr Cg::transpose() const +{ + return build() + .with_generated_preconditioner( + share(as(this->get_preconditioner())->transpose())) + .with_criteria(this->stop_criterion_factory_) + .on(this->get_executor()) + ->generate( + share(as(this->get_system_matrix())->transpose())); +} + + +template +std::unique_ptr Cg::conj_transpose() const +{ + return build() + .with_generated_preconditioner(share( + as(this->get_preconditioner())->conj_transpose())) + .with_criteria(this->stop_criterion_factory_) + .on(this->get_executor()) + ->generate(share( + as(this->get_system_matrix())->conj_transpose())); +} + + template void Cg::apply_impl(const LinOp *b, LinOp *x) const { diff --git a/core/solver/cg_kernels.hpp b/core/solver/cg_kernels.hpp index 59e79e60c0a..3a52974033a 100644 --- a/core/solver/cg_kernels.hpp +++ b/core/solver/cg_kernels.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -110,6 +110,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace reference +namespace hip { +namespace cg { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace cg +} // namespace hip + + #undef GKO_DECLARE_ALL_AS_TEMPLATES @@ -117,4 +126,4 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace gko -#endif // GKO_CORE_SOLVER_CG_KERNELS_HPP +#endif // GKO_CORE_SOLVER_CG_KERNELS_HPP_ diff --git a/core/solver/cgs.cpp b/core/solver/cgs.cpp index 7ac786745be..f92f9afc30f 100644 --- a/core/solver/cgs.cpp +++ b/core/solver/cgs.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -59,6 +59,32 @@ GKO_REGISTER_OPERATION(step_3, cgs::step_3); } // namespace cgs +template +std::unique_ptr Cgs::transpose() const +{ + return build() + .with_generated_preconditioner( + share(as(this->get_preconditioner())->transpose())) + .with_criteria(this->stop_criterion_factory_) + .on(this->get_executor()) + ->generate( + share(as(this->get_system_matrix())->transpose())); +} + + +template +std::unique_ptr Cgs::conj_transpose() const +{ + return build() + .with_generated_preconditioner(share( + as(this->get_preconditioner())->conj_transpose())) + .with_criteria(this->stop_criterion_factory_) + .on(this->get_executor()) + ->generate(share( + as(this->get_system_matrix())->conj_transpose())); +} + + template void Cgs::apply_impl(const LinOp *b, LinOp *x) const { diff --git a/core/solver/cgs_kernels.hpp b/core/solver/cgs_kernels.hpp index 11899f374ea..1404303b2ce 100644 --- a/core/solver/cgs_kernels.hpp +++ b/core/solver/cgs_kernels.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include + namespace gko { namespace kernels { namespace cgs { @@ -125,6 +126,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace reference +namespace hip { +namespace cgs { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace cgs +} // namespace hip + + #undef GKO_DECLARE_ALL_AS_TEMPLATES @@ -132,4 +142,4 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace gko -#endif // GKO_CORE_SOLVER_CGS_KERNELS_HPP +#endif // GKO_CORE_SOLVER_CGS_KERNELS_HPP_ diff --git a/core/solver/fcg.cpp b/core/solver/fcg.cpp index 11e86aa4d3a..595476f1637 100644 --- a/core/solver/fcg.cpp +++ b/core/solver/fcg.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -56,6 +56,32 @@ GKO_REGISTER_OPERATION(step_2, fcg::step_2); } // namespace fcg +template +std::unique_ptr Fcg::transpose() const +{ + return build() + .with_generated_preconditioner( + share(as(this->get_preconditioner())->transpose())) + .with_criteria(this->stop_criterion_factory_) + .on(this->get_executor()) + ->generate( + share(as(this->get_system_matrix())->transpose())); +} + + +template +std::unique_ptr Fcg::conj_transpose() const +{ + return build() + .with_generated_preconditioner(share( + as(this->get_preconditioner())->conj_transpose())) + .with_criteria(this->stop_criterion_factory_) + .on(this->get_executor()) + ->generate(share( + as(this->get_system_matrix())->conj_transpose())); +} + + template void Fcg::apply_impl(const LinOp *b, LinOp *x) const { @@ -67,7 +93,6 @@ void Fcg::apply_impl(const LinOp *b, LinOp *x) const constexpr uint8 RelativeStoppingId{1}; auto exec = this->get_executor(); - size_type num_vectors = dense_b->get_size()[1]; auto one_op = initialize({one()}, exec); auto neg_one_op = initialize({-one()}, exec); diff --git a/core/solver/fcg_kernels.hpp b/core/solver/fcg_kernels.hpp index 
28be5ade514..dc269f2fa19 100644 --- a/core/solver/fcg_kernels.hpp +++ b/core/solver/fcg_kernels.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -111,6 +111,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace reference +namespace hip { +namespace fcg { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace fcg +} // namespace hip + + #undef GKO_DECLARE_ALL_AS_TEMPLATES diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp index b56ef394d7b..9e9c39c3848 100644 --- a/core/solver/gmres.cpp +++ b/core/solver/gmres.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -63,33 +63,32 @@ GKO_REGISTER_OPERATION(step_2, gmres::step_2); } // namespace gmres -namespace { - - template -void apply_preconditioner( - const LinOp *preconditioner, matrix::Dense *krylov_bases, - std::shared_ptr> &preconditioned_vector, - const size_type iter) +std::unique_ptr Gmres::transpose() const { - std::shared_ptr> target_basis = - krylov_bases->create_submatrix( - span{0, krylov_bases->get_size()[0]}, - span{iter * preconditioned_vector->get_size()[1], - (iter + 1) * preconditioned_vector->get_size()[1]}); - - // Apply preconditioner - auto identity_pointer = - dynamic_cast *>(preconditioner); - if (identity_pointer) { - preconditioned_vector = target_basis; - } else { - preconditioner->apply(target_basis.get(), preconditioned_vector.get()); - } + return build() + .with_generated_preconditioner( + share(as(this->get_preconditioner())->transpose())) + .with_criteria(this->stop_criterion_factory_) + .with_krylov_dim(this->get_krylov_dim()) + .on(this->get_executor()) + 
->generate( + share(as(this->get_system_matrix())->transpose())); } -} // namespace +template +std::unique_ptr Gmres::conj_transpose() const +{ + return build() + .with_generated_preconditioner(share( + as(this->get_preconditioner())->conj_transpose())) + .with_criteria(this->stop_criterion_factory_) + .with_krylov_dim(this->get_krylov_dim()) + .on(this->get_executor()) + ->generate(share( + as(this->get_system_matrix())->conj_transpose())); +} template @@ -98,6 +97,7 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix_); using Vector = matrix::Dense; + using NormVector = matrix::Dense>; constexpr uint8 RelativeStoppingId{1}; @@ -110,9 +110,8 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const auto dense_x = as(x); auto residual = Vector::create_with_config_of(dense_b); auto krylov_bases = Vector::create( - exec, dim<2>{system_matrix_->get_size()[1], - (krylov_dim_ + 1) * dense_b->get_size()[1]}); - auto next_krylov_basis = Vector::create_with_config_of(dense_b); + exec, dim<2>{system_matrix_->get_size()[1] * (krylov_dim_ + 1), + dense_b->get_size()[1]}); std::shared_ptr> preconditioned_vector = Vector::create_with_config_of(dense_b); auto hessenberg = Vector::create( @@ -124,8 +123,7 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const auto residual_norm_collection = Vector::create(exec, dim<2>{krylov_dim_ + 1, dense_b->get_size()[1]}); auto residual_norm = - Vector::create(exec, dim<2>{1, dense_b->get_size()[1]}); - auto b_norm = Vector::create(exec, dim<2>{1, dense_b->get_size()[1]}); + NormVector::create(exec, dim<2>{1, dense_b->get_size()[1]}); Array final_iter_nums(this->get_executor(), dense_b->get_size()[1]); auto y = Vector::create(exec, dim<2>{krylov_dim_, dense_b->get_size()[1]}); @@ -135,21 +133,19 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const dense_b->get_size()[1]); // Initialization - exec->run(gmres::make_initialize_1(dense_b, b_norm.get(), residual.get(), + 
exec->run(gmres::make_initialize_1(dense_b, residual.get(), givens_sin.get(), givens_cos.get(), &stop_status, krylov_dim_)); - // b_norm = norm(b) // residual = dense_b // givens_sin = givens_cos = 0 system_matrix_->apply(neg_one_op.get(), dense_x, one_op.get(), residual.get()); // residual = residual - Ax - exec->run(gmres::make_initialize_2( residual.get(), residual_norm.get(), residual_norm_collection.get(), krylov_bases.get(), &final_iter_nums, krylov_dim_)); // residual_norm = norm(residual) - // residual_norm_collection = {residual_norm, 0, ..., 0} + // residual_norm_collection = {residual_norm, unchanged} // krylov_bases(:, 1) = residual / residual_norm // final_iter_nums = {0, ..., 0} @@ -178,6 +174,7 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const break; } + if (restart_iter == krylov_dim_) { // Restart exec->run(gmres::make_step_2(residual_norm_collection.get(), @@ -186,12 +183,13 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const &final_iter_nums)); // Solve upper triangular. 
// y = hessenberg \ residual_norm_collection + // before_preconditioner = krylov_bases * y get_preconditioner()->apply(before_preconditioner.get(), after_preconditioner.get()); dense_x->add_scaled(one_op.get(), after_preconditioner.get()); // Solve x - // x = x + get_preconditioner() * krylov_bases * y + // x = x + get_preconditioner() * before_preconditioner residual->copy_from(dense_b); // residual = dense_b system_matrix_->apply(neg_one_op.get(), dense_x, one_op.get(), @@ -202,16 +200,23 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const residual_norm_collection.get(), krylov_bases.get(), &final_iter_nums, krylov_dim_)); // residual_norm = norm(residual) - // residual_norm_collection = {residual_norm, 0, ..., 0} + // residual_norm_collection = {residual_norm, unchanged} // krylov_bases(:, 1) = residual / residual_norm // final_iter_nums = {0, ..., 0} restart_iter = 0; } - - apply_preconditioner(get_preconditioner().get(), krylov_bases.get(), - preconditioned_vector, restart_iter); - // preconditioned_vector = get_preconditioner() * - // krylov_bases(:, restart_iter) + auto this_krylov = krylov_bases->create_submatrix( + span{system_matrix_->get_size()[0] * restart_iter, + system_matrix_->get_size()[0] * (restart_iter + 1)}, + span{0, dense_b->get_size()[1]}); + + auto next_krylov = krylov_bases->create_submatrix( + span{system_matrix_->get_size()[0] * (restart_iter + 1), + system_matrix_->get_size()[0] * (restart_iter + 2)}, + span{0, dense_b->get_size()[1]}); + get_preconditioner()->apply(this_krylov.get(), + preconditioned_vector.get()); + // preconditioned_vector = get_preconditioner() * this_krylov // Do Arnoldi and givens rotation auto hessenberg_iter = hessenberg->create_submatrix( @@ -220,46 +225,58 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const dense_b->get_size()[1] * (restart_iter + 1)}); // Start of arnoldi - system_matrix_->apply(preconditioned_vector.get(), - next_krylov_basis.get()); - // next_krylov_basis = A * 
preconditioned_vector + system_matrix_->apply(preconditioned_vector.get(), next_krylov.get()); + // next_krylov = A * preconditioned_vector exec->run(gmres::make_step_1( - next_krylov_basis.get(), givens_sin.get(), givens_cos.get(), + dense_b->get_size()[0], givens_sin.get(), givens_cos.get(), residual_norm.get(), residual_norm_collection.get(), - krylov_bases.get(), hessenberg_iter.get(), b_norm.get(), - restart_iter, &final_iter_nums, &stop_status)); - // for i in 0:restart_iter + krylov_bases.get(), hessenberg_iter.get(), restart_iter, + &final_iter_nums, &stop_status)); + // final_iter_nums += 1 (unconverged) + // next_krylov_basis is alias for (restart_iter + 1)-th krylov_bases + // for i in 0:restart_iter(include) // hessenberg(restart_iter, i) = next_krylov_basis' * - // krylov_bases(:, i) next_krylov_basis -= hessenberg(restart_iter, - // i) * krylov_bases(:, i) + // krylov_bases(:, i) + // next_krylov_basis -= hessenberg(restart_iter, i) * + // krylov_bases(:, i) // end - // hessenberg(restart_iter, restart_iter + 1) = norm(next_krylov_basis) - // next_krylov_basis /= hessenberg(restart_iter, restart_iter + 1) + // hessenberg(restart_iter+1, restart_iter) = norm(next_krylov_basis) + // next_krylov_basis /= hessenberg(restart_iter + 1, restart_iter) // End of arnoldi // Start apply givens rotation - // for j in 0:restart_iter + // for j in 0:restart_iter(exclude) // temp = cos(j)*hessenberg(j) + // sin(j)*hessenberg(j+1) - // hessenberg(j+1) = -sin(j)*hessenberg(j) + - // cos(j)*hessenberg(j+1) + // hessenberg(j+1) = -conj(sin(j))*hessenberg(j) + + // conj(cos(j))*hessenberg(j+1) // hessenberg(j) = temp; // end // Calculate sin and cos + // this_hess = hessenberg(restart_iter) + // next_hess = hessenberg(restart_iter+1) + // hypotenuse = sqrt(this_hess * this_hess + next_hess * next_hess); + // cos(restart_iter) = conj(this_hess) / hypotenuse; + // sin(restart_iter) = conj(next_hess) / hypotenuse // hessenberg(restart_iter) = - // 
cos(restart_iter)*hessenberg(restart_iter) + - // sin(restart_iter)*hessenberg(restart_iter) + // cos(restart_iter)*hessenberg(restart_iter) + + // sin(restart_iter)*hessenberg(restart_iter+1) // hessenberg(restart_iter+1) = 0 // End apply givens rotation // Calculate residual norm + // this_rnc = residual_norm_collection(restart_iter) + // next_rnc = -conj(sin(restart_iter)) * this_rnc + // residual_norm_collection(restart_iter) = cos(restart_iter) * this_rnc + // residual_norm = abs(next_rnc) + // residual_norm_collection(restart_iter + 1) = next_rnc restart_iter++; } // Solve x auto krylov_bases_small = krylov_bases->create_submatrix( - span{0, system_matrix_->get_size()[0]}, - span{0, dense_b->get_size()[1] * (restart_iter + 1)}); + span{0, system_matrix_->get_size()[0] * (restart_iter + 1)}, + span{0, dense_b->get_size()[1]}); auto hessenberg_small = hessenberg->create_submatrix( span{0, restart_iter}, span{0, dense_b->get_size()[1] * (restart_iter)}); @@ -270,12 +287,12 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const &final_iter_nums)); // Solve upper triangular. // y = hessenberg \ residual_norm_collection - + // before_preconditioner = krylov_bases * y get_preconditioner()->apply(before_preconditioner.get(), after_preconditioner.get()); dense_x->add_scaled(one_op.get(), after_preconditioner.get()); // Solve x - // x = x + get_preconditioner() * krylov_bases * y + // x = x + get_preconditioner() * before_preconditioner } diff --git a/core/solver/gmres_kernels.hpp b/core/solver/gmres_kernels.hpp index ecb448e3e20..644a8cf708e 100644 --- a/core/solver/gmres_kernels.hpp +++ b/core/solver/gmres_kernels.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -40,40 +40,38 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include + namespace gko { namespace kernels { namespace gmres { -#define GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL(_type) \ - void initialize_1( \ - std::shared_ptr exec, \ - const matrix::Dense<_type> *b, matrix::Dense<_type> *b_norm, \ - matrix::Dense<_type> *residual, matrix::Dense<_type> *givens_sin, \ - matrix::Dense<_type> *givens_cos, Array *stop_status, \ - size_type krylov_dim) +#define GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL(_type) \ + void initialize_1( \ + std::shared_ptr exec, \ + const matrix::Dense<_type> *b, matrix::Dense<_type> *residual, \ + matrix::Dense<_type> *givens_sin, matrix::Dense<_type> *givens_cos, \ + Array *stop_status, size_type krylov_dim) -#define GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL(_type) \ - void initialize_2(std::shared_ptr exec, \ - const matrix::Dense<_type> *residual, \ - matrix::Dense<_type> *residual_norm, \ - matrix::Dense<_type> *residual_norm_collection, \ - matrix::Dense<_type> *krylov_bases, \ +#define GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL(_type) \ + void initialize_2(std::shared_ptr exec, \ + const matrix::Dense<_type> *residual, \ + matrix::Dense> *residual_norm, \ + matrix::Dense<_type> *residual_norm_collection, \ + matrix::Dense<_type> *krylov_bases, \ Array *final_iter_nums, size_type krylov_dim) -#define GKO_DECLARE_GMRES_STEP_1_KERNEL(_type) \ - void step_1(std::shared_ptr exec, \ - matrix::Dense<_type> *next_krylov_basis, \ - matrix::Dense<_type> *givens_sin, \ - matrix::Dense<_type> *givens_cos, \ - matrix::Dense<_type> *residual_norm, \ - matrix::Dense<_type> *residual_norm_collection, \ - matrix::Dense<_type> *krylov_bases, \ - matrix::Dense<_type> *hessenberg_iter, \ - const matrix::Dense<_type> *b_norm, size_type iter, \ - Array *final_iter_nums, \ +#define GKO_DECLARE_GMRES_STEP_1_KERNEL(_type) \ + void step_1(std::shared_ptr exec, \ + size_type num_rows, matrix::Dense<_type> *givens_sin, \ + matrix::Dense<_type> *givens_cos, \ + matrix::Dense> *residual_norm, \ + matrix::Dense<_type> 
*residual_norm_collection, \ + matrix::Dense<_type> *krylov_bases, \ + matrix::Dense<_type> *hessenberg_iter, size_type iter, \ + Array *final_iter_nums, \ const Array *stop_status) @@ -128,6 +126,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace reference +namespace hip { +namespace gmres { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace gmres +} // namespace hip + + #undef GKO_DECLARE_ALL_AS_TEMPLATES @@ -135,4 +142,4 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace gko -#endif // GKO_CORE_SOLVER_GMRES_KERNELS_HPP +#endif // GKO_CORE_SOLVER_GMRES_KERNELS_HPP_ diff --git a/core/solver/ir.cpp b/core/solver/ir.cpp index f1ea9bd51e4..63e80f86c04 100644 --- a/core/solver/ir.cpp +++ b/core/solver/ir.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -50,6 +50,34 @@ GKO_REGISTER_OPERATION(initialize, ir::initialize); } // namespace ir +template +std::unique_ptr Ir::transpose() const +{ + return build() + .with_generated_solver( + share(as(this->get_solver())->transpose())) + .with_criteria(this->stop_criterion_factory_) + .with_relaxation_factor(parameters_.relaxation_factor) + .on(this->get_executor()) + ->generate( + share(as(this->get_system_matrix())->transpose())); +} + + +template +std::unique_ptr Ir::conj_transpose() const +{ + return build() + .with_generated_solver( + share(as(this->get_solver())->conj_transpose())) + .with_criteria(this->stop_criterion_factory_) + .with_relaxation_factor(conj(parameters_.relaxation_factor)) + .on(this->get_executor()) + ->generate(share( + as(this->get_system_matrix())->conj_transpose())); +} + + template void Ir::apply_impl(const LinOp *b, LinOp *x) const { @@ -63,6 +91,7 @@ void Ir::apply_impl(const LinOp *b, LinOp *x) const auto dense_b = as(b); auto dense_x = as(x); auto residual = 
Vector::create_with_config_of(dense_b); + auto inner_solution = Vector::create_with_config_of(dense_b); bool one_changed{}; Array stop_status(exec, dense_b->get_size()[1]); @@ -91,10 +120,30 @@ void Ir::apply_impl(const LinOp *b, LinOp *x) const break; } - solver_->apply(lend(one_op), lend(residual), lend(one_op), dense_x); - residual->copy_from(dense_b); - system_matrix_->apply(lend(neg_one_op), dense_x, lend(one_op), - lend(residual)); + if (solver_->apply_uses_initial_guess()) { + // Use the inner solver to solve + // A * inner_solution = residual + // with residual as initial guess. + inner_solution->copy_from(lend(residual)); + solver_->apply(lend(residual), lend(inner_solution)); + + // x = x + relaxation_factor * inner_solution + dense_x->add_scaled(lend(relaxation_factor_), lend(inner_solution)); + + // residual = b - A * x + residual->copy_from(dense_b); + system_matrix_->apply(lend(neg_one_op), dense_x, lend(one_op), + lend(residual)); + } else { + // x = x + relaxation_factor * A \ residual + solver_->apply(lend(relaxation_factor_), lend(residual), + lend(one_op), dense_x); + + // residual = b - A * x + residual->copy_from(dense_b); + system_matrix_->apply(lend(neg_one_op), dense_x, lend(one_op), + lend(residual)); + } } } diff --git a/core/solver/ir_kernels.hpp b/core/solver/ir_kernels.hpp index 56c78e9e853..9fe59ba4a6c 100644 --- a/core/solver/ir_kernels.hpp +++ b/core/solver/ir_kernels.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -83,6 +83,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace reference +namespace hip { +namespace ir { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace ir +} // namespace hip + + #undef GKO_DECLARE_ALL_AS_TEMPLATES @@ -90,4 +99,4 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace gko -#endif // GKO_CORE_SOLVER_IR_KERNELS_HPP +#endif // GKO_CORE_SOLVER_IR_KERNELS_HPP_ diff --git a/core/solver/lower_trs.cpp b/core/solver/lower_trs.cpp index 987e8609573..bb4bb19c25b 100644 --- a/core/solver/lower_trs.cpp +++ b/core/solver/lower_trs.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include "core/solver/lower_trs_kernels.hpp" @@ -61,6 +62,26 @@ GKO_REGISTER_OPERATION(solve, lower_trs::solve); } // namespace lower_trs +template +std::unique_ptr LowerTrs::transpose() const +{ + return transposed_type::build() + .with_num_rhs(this->parameters_.num_rhs) + .on(this->get_executor()) + ->generate(share(this->get_system_matrix()->transpose())); +} + + +template +std::unique_ptr LowerTrs::conj_transpose() const +{ + return transposed_type::build() + .with_num_rhs(this->parameters_.num_rhs) + .on(this->get_executor()) + ->generate(share(this->get_system_matrix()->conj_transpose())); +} + + template void LowerTrs::init_trs_solve_struct() { diff --git a/core/solver/lower_trs_kernels.hpp b/core/solver/lower_trs_kernels.hpp index b2c931d76cf..799c50129e0 100644 --- a/core/solver/lower_trs_kernels.hpp +++ b/core/solver/lower_trs_kernels.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 
2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,13 +34,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_CORE_SOLVER_LOWER_TRS_KERNELS_HPP_ +#include + + #include #include #include #include -#include namespace gko { @@ -112,6 +114,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace reference +namespace hip { +namespace lower_trs { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace lower_trs +} // namespace hip + + #undef GKO_DECLARE_ALL_AS_TEMPLATES @@ -119,4 +130,4 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace gko -#endif // GKO_CORE_SOLVER_LOWER_TRS_KERNELS_HPP +#endif // GKO_CORE_SOLVER_LOWER_TRS_KERNELS_HPP_ diff --git a/core/solver/upper_trs.cpp b/core/solver/upper_trs.cpp index 081ce3aac01..236de82a27b 100644 --- a/core/solver/upper_trs.cpp +++ b/core/solver/upper_trs.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include "core/solver/upper_trs_kernels.hpp" @@ -61,6 +62,26 @@ GKO_REGISTER_OPERATION(solve, upper_trs::solve); } // namespace upper_trs +template +std::unique_ptr UpperTrs::transpose() const +{ + return transposed_type::build() + .with_num_rhs(this->parameters_.num_rhs) + .on(this->get_executor()) + ->generate(share(this->get_system_matrix()->transpose())); +} + + +template +std::unique_ptr UpperTrs::conj_transpose() const +{ + return transposed_type::build() + .with_num_rhs(this->parameters_.num_rhs) + .on(this->get_executor()) + ->generate(share(this->get_system_matrix()->conj_transpose())); +} + + template void UpperTrs::init_trs_solve_struct() { diff --git a/core/solver/upper_trs_kernels.hpp b/core/solver/upper_trs_kernels.hpp index 34e4426ff68..cce48ea2812 100644 --- a/core/solver/upper_trs_kernels.hpp +++ b/core/solver/upper_trs_kernels.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,13 +34,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_CORE_SOLVER_UPPER_TRS_KERNELS_HPP_ +#include + + #include #include #include #include -#include namespace gko { @@ -112,6 +114,15 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace reference +namespace hip { +namespace upper_trs { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace upper_trs +} // namespace hip + + #undef GKO_DECLARE_ALL_AS_TEMPLATES @@ -119,4 +130,4 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace gko -#endif // GKO_CORE_SOLVER_UPPER_TRS_KERNELS_HPP +#endif // GKO_CORE_SOLVER_UPPER_TRS_KERNELS_HPP_ diff --git a/core/stop/combined.cpp b/core/stop/combined.cpp index 502d868a78e..f80df54b90b 100644 --- a/core/stop/combined.cpp +++ b/core/stop/combined.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ - #include diff --git a/core/stop/criterion.cpp b/core/stop/criterion.cpp index db225012a20..25019d7d0d7 100644 --- a/core/stop/criterion.cpp +++ b/core/stop/criterion.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ - #include diff --git a/core/stop/criterion_kernels.hpp b/core/stop/criterion_kernels.hpp index b666b9ef8dd..07eb8f2798c 100644 --- a/core/stop/criterion_kernels.hpp +++ b/core/stop/criterion_kernels.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -78,6 +78,15 @@ GKO_DECLARE_SET_ALL_STATUSES_KERNEL(); } // namespace set_all_statuses } // namespace reference + + +namespace hip { +namespace set_all_statuses { + +GKO_DECLARE_SET_ALL_STATUSES_KERNEL(); + +} // namespace set_all_statuses +} // namespace hip } // namespace kernels } // namespace gko diff --git a/core/stop/iteration.cpp b/core/stop/iteration.cpp index 684ed00ec1e..8c1a6bc5a7d 100644 --- a/core/stop/iteration.cpp +++ b/core/stop/iteration.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ - #include diff --git a/core/stop/residual_norm_reduction.cpp b/core/stop/residual_norm.cpp similarity index 57% rename from core/stop/residual_norm_reduction.cpp rename to core/stop/residual_norm.cpp index 35285a37dc3..5c928bbf48d 100644 --- a/core/stop/residual_norm_reduction.cpp +++ b/core/stop/residual_norm.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -30,34 +30,34 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ +#include -#include - -#include "core/stop/residual_norm_reduction_kernels.hpp" +#include "core/components/fill_array.hpp" +#include "core/stop/residual_norm_kernels.hpp" namespace gko { namespace stop { -namespace residual_norm_reduction { +namespace residual_norm { -GKO_REGISTER_OPERATION(residual_norm_reduction, - residual_norm_reduction::residual_norm_reduction); +GKO_REGISTER_OPERATION(residual_norm, residual_norm::residual_norm); +GKO_REGISTER_OPERATION(fill_array, components::fill_array); -} // namespace residual_norm_reduction +} // namespace residual_norm template -bool ResidualNormReduction::check_impl( - uint8 stoppingId, bool setFinalized, Array *stop_status, - bool *one_changed, const Criterion::Updater &updater) +bool ResidualNorm::check_impl(uint8 stoppingId, bool setFinalized, + Array *stop_status, + bool *one_changed, + const Criterion::Updater &updater) { - std::unique_ptr u_dense_tau; - const Vector *dense_tau; + const NormVector *dense_tau; if (updater.residual_norm_ != nullptr) { - dense_tau = as(updater.residual_norm_); + dense_tau = as(updater.residual_norm_); } else if (updater.residual_ != nullptr) { auto *dense_r = as(updater.residual_); dense_r->compute_norm2(u_dense_tau_.get()); @@ -67,18 +67,29 @@ bool ResidualNormReduction::check_impl( } bool all_converged = true; - this->get_executor()->run( - residual_norm_reduction::make_residual_norm_reduction( - dense_tau, starting_tau_.get(), parameters_.reduction_factor, - stoppingId, setFinalized, stop_status, &this->device_storage_, - &all_converged, one_changed)); + this->get_executor()->run(residual_norm::make_residual_norm( + dense_tau, starting_tau_.get(), tolerance_, stoppingId, setFinalized, + stop_status, 
&device_storage_, &all_converged, one_changed)); + return all_converged; } +template +void AbsoluteResidualNorm::initialize_starting_tau() +{ + this->get_executor()->run(residual_norm::make_fill_array( + this->starting_tau_->get_values(), this->starting_tau_->get_size()[1], + gko::one>())); +} + + +#define GKO_DECLARE_RESIDUAL_NORM(_type) class ResidualNorm<_type> +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM); + -#define GKO_DECLARE_RESIDUAL_NORM_REDUCTION(_type) \ - class ResidualNormReduction<_type> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM_REDUCTION); +#define GKO_DECLARE_ABSOLUTE_RESIDUAL_NORM(_type) \ + class AbsoluteResidualNorm<_type> +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_ABSOLUTE_RESIDUAL_NORM); } // namespace stop diff --git a/core/stop/residual_norm_reduction_kernels.hpp b/core/stop/residual_norm_kernels.hpp similarity index 71% rename from core/stop/residual_norm_reduction_kernels.hpp rename to core/stop/residual_norm_kernels.hpp index e56be461448..30407cf9b9f 100644 --- a/core/stop/residual_norm_reduction_kernels.hpp +++ b/core/stop/residual_norm_kernels.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,8 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_CORE_STOP_RESIDUAL_NORM_REDUCTION_KERNELS_HPP_ -#define GKO_CORE_STOP_RESIDUAL_NORM_REDUCTION_KERNELS_HPP_ +#ifndef GKO_CORE_STOP_RESIDUAL_NORM_KERNELS_HPP_ +#define GKO_CORE_STOP_RESIDUAL_NORM_KERNELS_HPP_ #include @@ -43,56 +43,65 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
namespace gko { namespace kernels { -namespace residual_norm_reduction { +namespace residual_norm { -#define GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL(_type) \ - void residual_norm_reduction( \ +#define GKO_DECLARE_RESIDUAL_NORM_KERNEL(_type) \ + void residual_norm( \ std::shared_ptr exec, \ const matrix::Dense<_type> *tau, const matrix::Dense<_type> *orig_tau, \ - remove_complex<_type> rel_residual_goal, uint8 stoppingId, \ - bool setFinalized, Array *stop_status, \ - Array *device_storage, bool *all_converged, bool *one_changed) + _type rel_residual_goal, uint8 stoppingId, bool setFinalized, \ + Array *stop_status, Array *device_storage, \ + bool *all_converged, bool *one_changed) #define GKO_DECLARE_ALL_AS_TEMPLATES \ template \ - GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL(ValueType) + GKO_DECLARE_RESIDUAL_NORM_KERNEL(ValueType) -} // namespace residual_norm_reduction +} // namespace residual_norm namespace omp { -namespace residual_norm_reduction { +namespace residual_norm { GKO_DECLARE_ALL_AS_TEMPLATES; -} // namespace residual_norm_reduction +} // namespace residual_norm } // namespace omp namespace cuda { -namespace residual_norm_reduction { +namespace residual_norm { GKO_DECLARE_ALL_AS_TEMPLATES; -} // namespace residual_norm_reduction +} // namespace residual_norm } // namespace cuda namespace reference { -namespace residual_norm_reduction { +namespace residual_norm { GKO_DECLARE_ALL_AS_TEMPLATES; -} // namespace residual_norm_reduction +} // namespace residual_norm } // namespace reference +namespace hip { +namespace residual_norm { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace residual_norm +} // namespace hip + + #undef GKO_DECLARE_ALL_AS_TEMPLATES } // namespace kernels } // namespace gko -#endif // GKO_CORE_STOP_RESIDUAL_NORM_REDUCTION_KERNELS_HPP_ +#endif // GKO_CORE_STOP_RESIDUAL_NORM_KERNELS_HPP_ diff --git a/core/stop/time.cpp b/core/stop/time.cpp index aea2e6c3952..8ec4ad4948a 100644 --- a/core/stop/time.cpp +++ b/core/stop/time.cpp @@ -1,5 
+1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ - #include diff --git a/core/synthesizer/implementation_selection.hpp b/core/synthesizer/implementation_selection.hpp index 26497b7346f..c757d4dcd1d 100644 --- a/core/synthesizer/implementation_selection.hpp +++ b/core/synthesizer/implementation_selection.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,8 +30,11 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_CORE_SYNTHESIZER_IMPLEMENTATION_SELECTION_ -#define GKO_CORE_SYNTHESIZER_IMPLEMENTATION_SELECTION_ +#ifndef GKO_CORE_SYNTHESIZER_IMPLEMENTATION_SELECTION_HPP_ +#define GKO_CORE_SYNTHESIZER_IMPLEMENTATION_SELECTION_HPP_ + + +#include #include @@ -39,9 +42,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include - - namespace gko { namespace syn { @@ -76,4 +76,4 @@ namespace syn { } // namespace gko -#endif // GKO_CORE_SYNTHESIZER_IMPLEMENTATION_SELECTION_ +#endif // GKO_CORE_SYNTHESIZER_IMPLEMENTATION_SELECTION_HPP_ diff --git a/core/test/CMakeLists.txt b/core/test/CMakeLists.txt index 322bf38e6a7..f0e39d5c568 100644 --- a/core/test/CMakeLists.txt +++ b/core/test/CMakeLists.txt @@ -1,3 +1,5 @@ +include(${CMAKE_SOURCE_DIR}/cmake/create_test.cmake) + add_subdirectory(base) add_subdirectory(factorization) add_subdirectory(log) diff --git a/core/test/base/CMakeLists.txt b/core/test/base/CMakeLists.txt index 914b58ab1d3..1183b339e07 100644 --- a/core/test/base/CMakeLists.txt +++ b/core/test/base/CMakeLists.txt @@ -1,4 +1,5 @@ ginkgo_create_test(abstract_factory) +ginkgo_create_test(allocator) ginkgo_create_test(array) ginkgo_create_test(combination) ginkgo_create_test(composition) @@ -16,6 +17,7 @@ ginkgo_create_test(perturbation) ginkgo_create_test(polymorphic_object) ginkgo_create_test(range) ginkgo_create_test(range_accessors) +ginkgo_create_thread_test(sanitizers) ginkgo_create_test(types) ginkgo_create_test(utils) ginkgo_create_test(version) diff --git a/core/test/base/abstract_factory.cpp b/core/test/base/abstract_factory.cpp index 46591eab5b9..cf57531e7fe 100644 --- a/core/test/base/abstract_factory.cpp +++ b/core/test/base/abstract_factory.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/core/test/base/allocator.cpp b/core/test/base/allocator.cpp new file mode 100644 index 00000000000..32e4c9db85d --- /dev/null +++ b/core/test/base/allocator.cpp @@ -0,0 +1,93 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/base/allocator.hpp" + + +#include + + +#include + + +namespace { + + +TEST(ExecutorAllocator, Works) +{ + auto exec = gko::ReferenceExecutor::create(); + auto alloc = gko::ExecutorAllocator(exec); + + int *ptr{}; + ASSERT_NO_THROW(ptr = alloc.allocate(10)); + // This test can only fail with sanitizers + ptr[0] = 0; + ptr[9] = 0; + + ASSERT_NO_THROW(alloc.deallocate(ptr, 10)); +} + + +TEST(ExecutorAllocator, WorksWithStdlib) +{ + auto exec = gko::ReferenceExecutor::create(); + auto alloc = gko::ExecutorAllocator(exec); + auto vec = std::vector>(10, 0, exec); + + // This test can only fail with sanitizers + vec[0] = 0; + vec[9] = 0; +} + + +TEST(ExecutorAllocator, ComparesEqual) +{ + auto exec = gko::ReferenceExecutor::create(); + auto alloc1 = gko::ExecutorAllocator(exec); + auto alloc2 = gko::ExecutorAllocator(exec); + + ASSERT_TRUE(alloc1 == alloc2); +} + + +TEST(ExecutorAllocator, ComparesNotEqual) +{ + auto exec1 = gko::ReferenceExecutor::create(); + auto exec2 = gko::OmpExecutor::create(); + auto alloc1 = gko::ExecutorAllocator(exec1); + auto alloc2 = gko::ExecutorAllocator(exec2); + + ASSERT_TRUE(alloc1 != alloc2); +} + + +} // namespace diff --git a/core/test/base/array.cpp b/core/test/base/array.cpp index 5b02b151c9a..b1e04dd2f39 100644 --- a/core/test/base/array.cpp +++ b/core/test/base/array.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,15 +33,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include + + #include #include +#include "core/test/utils.hpp" + + namespace { +template class Array : public ::testing::Test { protected: Array() : exec(gko::ReferenceExecutor::create()), x(exec, 2) @@ -50,259 +57,472 @@ class Array : public ::testing::Test { x.get_data()[1] = 2; } - static void assert_equal_to_original_x(gko::Array &a) + static void assert_equal_to_original_x(gko::Array &a) { ASSERT_EQ(a.get_num_elems(), 2); - EXPECT_EQ(a.get_data()[0], 5); - EXPECT_EQ(a.get_data()[1], 2); - EXPECT_EQ(a.get_const_data()[0], 5); - EXPECT_EQ(a.get_const_data()[1], 2); + EXPECT_EQ(a.get_data()[0], T{5}); + EXPECT_EQ(a.get_data()[1], T{2}); + EXPECT_EQ(a.get_const_data()[0], T{5}); + EXPECT_EQ(a.get_const_data()[1], T{2}); } std::shared_ptr exec; - gko::Array x; + gko::Array x; }; +TYPED_TEST_CASE(Array, gko::test::ValueAndIndexTypes); -TEST_F(Array, CanBeCreatedWithoutAnExecutor) + +TYPED_TEST(Array, CanBeCreatedWithoutAnExecutor) { - gko::Array a; + gko::Array a; ASSERT_EQ(a.get_executor(), nullptr); ASSERT_EQ(a.get_num_elems(), 0); } -TEST_F(Array, CanBeEmpty) +TYPED_TEST(Array, CanBeEmpty) { - gko::Array a(exec); + gko::Array a(this->exec); ASSERT_EQ(a.get_num_elems(), 0); } -TEST_F(Array, ReturnsNullWhenEmpty) +TYPED_TEST(Array, ReturnsNullWhenEmpty) { - gko::Array a(exec); + gko::Array a(this->exec); EXPECT_EQ(a.get_const_data(), nullptr); ASSERT_EQ(a.get_data(), nullptr); } -TEST_F(Array, CanBeCreatedFromExistingData) +TYPED_TEST(Array, CanBeCreatedFromExistingData) { - gko::Array a{exec, 3, new int[3], std::default_delete{}}; + gko::Array a{this->exec, 3, new TypeParam[3], + std::default_delete{}}; EXPECT_EQ(a.get_num_elems(), 3); } -TEST_F(Array, CanBeCreatedFromDataOnExecutor) +TYPED_TEST(Array, CanBeCreatedFromDataOnExecutor) { - gko::Array a{exec, 3, exec->alloc(3)}; + gko::Array a{this->exec, 3, + this->exec->template alloc(3)}; EXPECT_EQ(a.get_num_elems(), 3); } -TEST_F(Array, CanBeCreatedFromRange) +TYPED_TEST(Array, CanBeCreatedFromRange) { 
using std::begin; auto data = {1, 2, 3}; - gko::Array a{exec, begin(data), end(data)}; + gko::Array a{this->exec, begin(data), end(data)}; - EXPECT_EQ(a.get_const_data()[0], 1); - EXPECT_EQ(a.get_const_data()[1], 2); - ASSERT_EQ(a.get_const_data()[2], 3); + EXPECT_EQ(a.get_const_data()[0], TypeParam{1}); + EXPECT_EQ(a.get_const_data()[1], TypeParam{2}); + ASSERT_EQ(a.get_const_data()[2], TypeParam{3}); } -TEST_F(Array, CanBeCreatedFromInitializerList) +TYPED_TEST(Array, CanBeCreatedFromInitializerList) { - gko::Array a{exec, {1, 2, 3}}; + gko::Array a{this->exec, {1, 2, 3}}; - EXPECT_EQ(a.get_const_data()[0], 1); - EXPECT_EQ(a.get_const_data()[1], 2); - ASSERT_EQ(a.get_const_data()[2], 3); + EXPECT_EQ(a.get_const_data()[0], TypeParam{1}); + EXPECT_EQ(a.get_const_data()[1], TypeParam{2}); + ASSERT_EQ(a.get_const_data()[2], TypeParam{3}); } -TEST_F(Array, KnowsItsSize) { ASSERT_EQ(x.get_num_elems(), 2); } +TYPED_TEST(Array, KnowsItsSize) { ASSERT_EQ(this->x.get_num_elems(), 2); } -TEST_F(Array, ReturnsValidDataPtr) +TYPED_TEST(Array, ReturnsValidDataPtr) { - EXPECT_EQ(x.get_data()[0], 5); - EXPECT_EQ(x.get_data()[1], 2); + EXPECT_EQ(this->x.get_data()[0], TypeParam{5}); + EXPECT_EQ(this->x.get_data()[1], TypeParam{2}); } -TEST_F(Array, ReturnsValidConstDataPtr) +TYPED_TEST(Array, ReturnsValidConstDataPtr) { - EXPECT_EQ(x.get_const_data()[0], 5); - EXPECT_EQ(x.get_const_data()[1], 2); + EXPECT_EQ(this->x.get_const_data()[0], TypeParam{5}); + EXPECT_EQ(this->x.get_const_data()[1], TypeParam{2}); } -TEST_F(Array, KnowsItsExecutor) { ASSERT_EQ(x.get_executor(), exec); } +TYPED_TEST(Array, KnowsItsExecutor) +{ + ASSERT_EQ(this->x.get_executor(), this->exec); +} -TEST_F(Array, CanBeCopyConstructed) +TYPED_TEST(Array, CanBeCopyConstructed) { - gko::Array a(x); - x.get_data()[0] = 7; + gko::Array a(this->x); + this->x.get_data()[0] = 7; - assert_equal_to_original_x(a); + this->assert_equal_to_original_x(a); } -TEST_F(Array, CanBeMoveConstructed) +TYPED_TEST(Array, 
CanBeMoveConstructed) { - gko::Array a(std::move(x)); + gko::Array a(std::move(this->x)); - assert_equal_to_original_x(a); + this->assert_equal_to_original_x(a); } -TEST_F(Array, CanBeCopyConstructedToADifferentExecutor) +TYPED_TEST(Array, CanBeCopyConstructedToADifferentExecutor) { - gko::Array a{exec, x}; + gko::Array a{this->exec, this->x}; - assert_equal_to_original_x(a); + this->assert_equal_to_original_x(a); } -TEST_F(Array, CanBeMoveConstructedToADifferentExecutor) +TYPED_TEST(Array, CanBeMoveConstructedToADifferentExecutor) { - gko::Array a{exec, std::move(x)}; + gko::Array a{this->exec, std::move(this->x)}; - assert_equal_to_original_x(a); + this->assert_equal_to_original_x(a); } -TEST_F(Array, CanBeCopied) +TYPED_TEST(Array, CanBeCopied) { auto omp = gko::OmpExecutor::create(); - gko::Array a(omp, 3); + gko::Array a(omp, 3); - a = x; - x.get_data()[0] = 7; + a = this->x; + this->x.get_data()[0] = 7; - assert_equal_to_original_x(a); + this->assert_equal_to_original_x(a); } -TEST_F(Array, CanBeCopiedToExecutorlessArray) +TYPED_TEST(Array, CanBeCopiedToExecutorlessArray) { - gko::Array a; + gko::Array a; - a = x; + a = this->x; - ASSERT_EQ(a.get_executor(), x.get_executor()); - assert_equal_to_original_x(a); + ASSERT_EQ(a.get_executor(), this->x.get_executor()); + this->assert_equal_to_original_x(a); } -TEST_F(Array, CanBeCopiedFromExecutorlessArray) +TYPED_TEST(Array, CanBeCopiedFromExecutorlessArray) { - gko::Array a; + gko::Array a; - x = a; + this->x = a; - ASSERT_NE(x.get_executor(), nullptr); - ASSERT_EQ(x.get_num_elems(), 0); + ASSERT_NE(this->x.get_executor(), nullptr); + ASSERT_EQ(this->x.get_num_elems(), 0); } -TEST_F(Array, CanBeMoved) +TYPED_TEST(Array, CanBeMoved) { auto omp = gko::OmpExecutor::create(); - gko::Array a(omp, 3); + gko::Array a(omp, 3); - a = std::move(x); + a = std::move(this->x); - assert_equal_to_original_x(a); + this->assert_equal_to_original_x(a); } -TEST_F(Array, CanBeMovedToExecutorlessArray) +TYPED_TEST(Array, 
CanBeMovedToExecutorlessArray) { - gko::Array a; + gko::Array a; - a = std::move(x); + a = std::move(this->x); ASSERT_NE(a.get_executor(), nullptr); - assert_equal_to_original_x(a); + this->assert_equal_to_original_x(a); } -TEST_F(Array, CanBeMovedFromExecutorlessArray) +TYPED_TEST(Array, CanBeMovedFromExecutorlessArray) { - gko::Array a; + gko::Array a; - x = std::move(a); + this->x = std::move(a); - ASSERT_NE(x.get_executor(), nullptr); - ASSERT_EQ(x.get_num_elems(), 0); + ASSERT_NE(this->x.get_executor(), nullptr); + ASSERT_EQ(this->x.get_num_elems(), 0); } -TEST_F(Array, CanBeCleared) +TYPED_TEST(Array, CanBeCleared) { - x.clear(); + this->x.clear(); - ASSERT_EQ(x.get_num_elems(), 0); - ASSERT_EQ(x.get_data(), nullptr); - ASSERT_EQ(x.get_const_data(), nullptr); + ASSERT_EQ(this->x.get_num_elems(), 0); + ASSERT_EQ(this->x.get_data(), nullptr); + ASSERT_EQ(this->x.get_const_data(), nullptr); } -TEST_F(Array, CanBeResized) +TYPED_TEST(Array, CanBeResized) { - x.resize_and_reset(3); + this->x.resize_and_reset(3); + + this->x.get_data()[0] = 1; + this->x.get_data()[1] = 8; + this->x.get_data()[2] = 7; + + EXPECT_EQ(this->x.get_const_data()[0], TypeParam{1}); + EXPECT_EQ(this->x.get_const_data()[1], TypeParam{8}); + EXPECT_EQ(this->x.get_const_data()[2], TypeParam{7}); +} + + +TYPED_TEST(Array, ViewCannotBeResized) +{ + TypeParam data[] = {1, 2, 3}; + auto view = gko::Array::view(this->exec, 3, data); + + EXPECT_THROW(view.resize_and_reset(1), gko::NotSupported); + EXPECT_EQ(view.get_num_elems(), 3); + ASSERT_EQ(view.get_data()[0], TypeParam{1}); +} + + +template +class my_null_deleter { +public: + using pointer = T *; + + void operator()(pointer) const noexcept {} +}; + +template +class my_null_deleter { +public: + using pointer = T[]; + + void operator()(pointer) const noexcept {} +}; + - x.get_data()[0] = 1; - x.get_data()[1] = 8; - x.get_data()[2] = 7; +TYPED_TEST(Array, CustomDeleterCannotBeResized) +{ + TypeParam data[] = {1, 2, 3}; + auto view_custom_deleter = 
gko::Array( + this->exec, 3, data, my_null_deleter{}); - EXPECT_EQ(x.get_const_data()[0], 1); - EXPECT_EQ(x.get_const_data()[1], 8); - EXPECT_EQ(x.get_const_data()[2], 7); + EXPECT_THROW(view_custom_deleter.resize_and_reset(1), gko::NotSupported); + EXPECT_EQ(view_custom_deleter.get_num_elems(), 3); + ASSERT_EQ(view_custom_deleter.get_data()[0], TypeParam{1}); } -TEST_F(Array, CanBeAssignedAnExecutor) +TYPED_TEST(Array, CanBeAssignedAnExecutor) { - gko::Array a; + gko::Array a; - a.set_executor(exec); + a.set_executor(this->exec); - ASSERT_EQ(a.get_executor(), exec); + ASSERT_EQ(a.get_executor(), this->exec); } -TEST_F(Array, ChangesExecutors) +TYPED_TEST(Array, ChangesExecutors) { auto omp = gko::OmpExecutor::create(); - x.set_executor(omp); + this->x.set_executor(omp); + + ASSERT_EQ(this->x.get_executor(), omp); + this->assert_equal_to_original_x(this->x); +} + + +TYPED_TEST(Array, ViewModifiesOriginalData) +{ + TypeParam data[] = {1, 2, 3}; + auto view = gko::Array::view(this->exec, 3, data); + + TypeParam new_data[] = {5, 4, 2}; + std::copy(new_data, new_data + 3, view.get_data()); + + EXPECT_EQ(data[0], TypeParam{5}); + EXPECT_EQ(data[1], TypeParam{4}); + EXPECT_EQ(data[2], TypeParam{2}); + ASSERT_EQ(view.get_num_elems(), 3); +} + + +TYPED_TEST(Array, CopyArrayToArray) +{ + gko::Array array(this->exec, {1, 2, 3}); + gko::Array array2(this->exec, {5, 4, 2, 1}); + + array = array2; + + EXPECT_EQ(array.get_data()[0], TypeParam{5}); + EXPECT_EQ(array.get_data()[1], TypeParam{4}); + EXPECT_EQ(array.get_data()[2], TypeParam{2}); + EXPECT_EQ(array.get_data()[3], TypeParam{1}); + EXPECT_EQ(array.get_num_elems(), 4); + EXPECT_NE(array.get_data(), array2.get_data()); + ASSERT_EQ(array2.get_num_elems(), 4); +} + + +TYPED_TEST(Array, CopyViewToView) +{ + TypeParam data[] = {1, 2, 3}; + auto view = gko::Array::view(this->exec, 3, data); + TypeParam data2[] = {5, 4, 2}; + auto view2 = gko::Array::view(this->exec, 3, data2); + TypeParam data_size4[] = {5, 4, 2, 1}; + auto 
view_size4 = gko::Array::view(this->exec, 4, data_size4); + + view = view2; + view2.get_data()[0] = 2; + + EXPECT_EQ(data[0], TypeParam{5}); + EXPECT_EQ(data[1], TypeParam{4}); + EXPECT_EQ(data[2], TypeParam{2}); + EXPECT_EQ(view.get_num_elems(), 3); + EXPECT_EQ(view2.get_num_elems(), 3); + EXPECT_EQ(view2.get_data()[0], TypeParam{2}); + ASSERT_THROW(view2 = view_size4, gko::OutOfBoundsError); +} + + +TYPED_TEST(Array, CopyViewToArray) +{ + TypeParam data[] = {1, 2, 3, 4}; + auto view = gko::Array::view(this->exec, 4, data); + gko::Array array(this->exec, {5, 4, 2}); + + array = view; + view.get_data()[0] = 2; + + EXPECT_EQ(array.get_data()[0], TypeParam{1}); + EXPECT_EQ(array.get_data()[1], TypeParam{2}); + EXPECT_EQ(array.get_data()[2], TypeParam{3}); + EXPECT_EQ(array.get_data()[3], TypeParam{4}); + EXPECT_EQ(array.get_num_elems(), 4); + ASSERT_EQ(view.get_num_elems(), 4); +} + + +TYPED_TEST(Array, CopyArrayToView) +{ + TypeParam data[] = {1, 2, 3}; + auto view = gko::Array::view(this->exec, 3, data); + gko::Array array_size2(this->exec, {5, 4}); + gko::Array array_size4(this->exec, {5, 4, 2, 1}); + + view = array_size2; + + EXPECT_EQ(data[0], TypeParam{5}); + EXPECT_EQ(data[1], TypeParam{4}); + EXPECT_EQ(data[2], TypeParam{3}); + EXPECT_EQ(view.get_num_elems(), 3); + EXPECT_EQ(array_size2.get_num_elems(), 2); + ASSERT_THROW(view = array_size4, gko::OutOfBoundsError); +} + - ASSERT_EQ(x.get_executor(), omp); - assert_equal_to_original_x(x); +TYPED_TEST(Array, MoveArrayToArray) +{ + gko::Array array(this->exec, {1, 2, 3}); + gko::Array array2(this->exec, {5, 4, 2, 1}); + auto data2 = array2.get_data(); + + array = std::move(array2); + + EXPECT_EQ(array.get_data(), data2); + EXPECT_EQ(array.get_data()[0], TypeParam{5}); + EXPECT_EQ(array.get_data()[1], TypeParam{4}); + EXPECT_EQ(array.get_data()[2], TypeParam{2}); + EXPECT_EQ(array.get_data()[3], TypeParam{1}); + EXPECT_EQ(array.get_num_elems(), 4); + EXPECT_EQ(array2.get_data(), nullptr); + 
ASSERT_EQ(array2.get_num_elems(), 0); } -TEST_F(Array, CanCreateView) +TYPED_TEST(Array, MoveViewToView) { - int data[] = {1, 2, 3}; + TypeParam data[] = {1, 2, 3, 4}; + auto view = gko::Array::view(this->exec, 4, data); + TypeParam data2[] = {5, 4, 2}; + auto view2 = gko::Array::view(this->exec, 3, data2); + + view = std::move(view2); + + EXPECT_EQ(view.get_data(), data2); + EXPECT_EQ(view.get_data()[0], TypeParam{5}); + EXPECT_EQ(view.get_data()[1], TypeParam{4}); + EXPECT_EQ(view.get_data()[2], TypeParam{2}); + EXPECT_EQ(view.get_num_elems(), 3); + EXPECT_EQ(view2.get_data(), nullptr); + EXPECT_EQ(view2.get_num_elems(), 0); + EXPECT_NE(data, nullptr); + EXPECT_EQ(data[0], TypeParam{1}); + EXPECT_EQ(data[1], TypeParam{2}); + EXPECT_EQ(data[2], TypeParam{3}); + ASSERT_EQ(data[3], TypeParam{4}); +} - auto view = gko::Array::view(exec, 3, data); - view = gko::Array{exec, {5, 4, 2}}; - EXPECT_EQ(data[0], 5); - EXPECT_EQ(data[1], 4); - EXPECT_EQ(data[2], 2); +TYPED_TEST(Array, MoveViewToArray) +{ + TypeParam data[] = {1, 2, 3, 4}; + gko::Array array(this->exec, {5, 4, 2}); + auto view = gko::Array::view(this->exec, 4, data); + + array = std::move(view); + + EXPECT_EQ(array.get_data(), data); + EXPECT_EQ(array.get_data()[0], TypeParam{1}); + EXPECT_EQ(array.get_data()[1], TypeParam{2}); + EXPECT_EQ(array.get_data()[2], TypeParam{3}); + EXPECT_EQ(array.get_data()[3], TypeParam{4}); + EXPECT_EQ(array.get_num_elems(), 4); + EXPECT_EQ(data[0], TypeParam{1}); + EXPECT_EQ(data[1], TypeParam{2}); + EXPECT_EQ(data[2], TypeParam{3}); + EXPECT_EQ(data[3], TypeParam{4}); + EXPECT_EQ(view.get_data(), nullptr); + ASSERT_EQ(view.get_num_elems(), 0); +} + + +TYPED_TEST(Array, MoveArrayToView) +{ + TypeParam data[] = {1, 2, 3}; + auto view = gko::Array::view(this->exec, 3, data); + gko::Array array_size2(this->exec, {5, 4}); + gko::Array array_size4(this->exec, {5, 4, 2, 1}); + auto size2_ptr = array_size2.get_data(); + auto size4_ptr = array_size4.get_data(); + + view = 
std::move(array_size2); + + EXPECT_EQ(view.get_data()[0], TypeParam{5}); + EXPECT_EQ(view.get_data()[1], TypeParam{4}); + EXPECT_EQ(view.get_num_elems(), 2); + EXPECT_NE(view.get_data(), data); + EXPECT_EQ(view.get_data(), size2_ptr); + EXPECT_NO_THROW(view = std::move(array_size4)); + EXPECT_EQ(view.get_data(), size4_ptr); + EXPECT_EQ(array_size2.get_data(), nullptr); + ASSERT_EQ(array_size2.get_num_elems(), 0); } diff --git a/core/test/base/combination.cpp b/core/test/base/combination.cpp index f22772d5f1a..8c2eaf35b6f 100644 --- a/core/test/base/combination.cpp +++ b/core/test/base/combination.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -39,6 +39,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/test/utils.hpp" + + namespace { @@ -55,6 +58,7 @@ struct DummyOperator : public gko::EnableLinOp { }; +template class Combination : public ::testing::Test { protected: Combination() @@ -70,10 +74,12 @@ class Combination : public ::testing::Test { std::vector> coefficients; }; +TYPED_TEST_CASE(Combination, gko::test::ValueTypes); + -TEST_F(Combination, CanBeEmpty) +TYPED_TEST(Combination, CanBeEmpty) { - auto cmb = gko::Combination<>::create(exec); + auto cmb = gko::Combination::create(this->exec); ASSERT_EQ(cmb->get_size(), gko::dim<2>(0, 0)); ASSERT_EQ(cmb->get_coefficients().size(), 0); @@ -81,34 +87,35 @@ TEST_F(Combination, CanBeEmpty) } -TEST_F(Combination, CanCreateFromIterators) +TYPED_TEST(Combination, CanCreateFromIterators) { - auto cmb = - gko::Combination<>::create(begin(coefficients), end(coefficients), - begin(operators), end(operators)); + auto cmb = gko::Combination::create( + begin(this->coefficients), end(this->coefficients), + begin(this->operators), end(this->operators)); 
ASSERT_EQ(cmb->get_size(), gko::dim<2>(1, 1)); ASSERT_EQ(cmb->get_coefficients().size(), 2); ASSERT_EQ(cmb->get_operators().size(), 2); - ASSERT_EQ(cmb->get_coefficients()[0], coefficients[0]); - ASSERT_EQ(cmb->get_operators()[0], operators[0]); - ASSERT_EQ(cmb->get_coefficients()[1], coefficients[1]); - ASSERT_EQ(cmb->get_operators()[1], operators[1]); + ASSERT_EQ(cmb->get_coefficients()[0], this->coefficients[0]); + ASSERT_EQ(cmb->get_operators()[0], this->operators[0]); + ASSERT_EQ(cmb->get_coefficients()[1], this->coefficients[1]); + ASSERT_EQ(cmb->get_operators()[1], this->operators[1]); } -TEST_F(Combination, CanCreateFromList) +TYPED_TEST(Combination, CanCreateFromList) { - auto cmb = gko::Combination<>::create(coefficients[0], operators[0], - coefficients[1], operators[1]); + auto cmb = gko::Combination::create( + this->coefficients[0], this->operators[0], this->coefficients[1], + this->operators[1]); ASSERT_EQ(cmb->get_size(), gko::dim<2>(1, 1)); ASSERT_EQ(cmb->get_coefficients().size(), 2); ASSERT_EQ(cmb->get_operators().size(), 2); - ASSERT_EQ(cmb->get_coefficients()[0], coefficients[0]); - ASSERT_EQ(cmb->get_operators()[0], operators[0]); - ASSERT_EQ(cmb->get_coefficients()[1], coefficients[1]); - ASSERT_EQ(cmb->get_operators()[1], operators[1]); + ASSERT_EQ(cmb->get_coefficients()[0], this->coefficients[0]); + ASSERT_EQ(cmb->get_operators()[0], this->operators[0]); + ASSERT_EQ(cmb->get_coefficients()[1], this->coefficients[1]); + ASSERT_EQ(cmb->get_operators()[1], this->operators[1]); } diff --git a/core/test/base/composition.cpp b/core/test/base/composition.cpp index bea2dbd2755..aa9df458456 100644 --- a/core/test/base/composition.cpp +++ b/core/test/base/composition.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -39,6 +39,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/test/utils.hpp" + + namespace { @@ -56,6 +59,7 @@ struct DummyOperator : public gko::EnableLinOp { }; +template class Composition : public ::testing::Test { protected: Composition() @@ -68,35 +72,39 @@ class Composition : public ::testing::Test { std::vector> operators; }; +TYPED_TEST_CASE(Composition, gko::test::ValueTypes); + -TEST_F(Composition, CanBeEmpty) +TYPED_TEST(Composition, CanBeEmpty) { - auto cmp = gko::Composition<>::create(exec); + auto cmp = gko::Composition::create(this->exec); ASSERT_EQ(cmp->get_size(), gko::dim<2>(0, 0)); ASSERT_EQ(cmp->get_operators().size(), 0); } -TEST_F(Composition, CanCreateFromIterators) +TYPED_TEST(Composition, CanCreateFromIterators) { - auto cmp = gko::Composition<>::create(begin(operators), end(operators)); + auto cmp = gko::Composition::create(begin(this->operators), + end(this->operators)); ASSERT_EQ(cmp->get_size(), gko::dim<2>(2, 3)); ASSERT_EQ(cmp->get_operators().size(), 2); - ASSERT_EQ(cmp->get_operators()[0], operators[0]); - ASSERT_EQ(cmp->get_operators()[1], operators[1]); + ASSERT_EQ(cmp->get_operators()[0], this->operators[0]); + ASSERT_EQ(cmp->get_operators()[1], this->operators[1]); } -TEST_F(Composition, CanCreateFromList) +TYPED_TEST(Composition, CanCreateFromList) { - auto cmp = gko::Composition<>::create(operators[0], operators[1]); + auto cmp = gko::Composition::create(this->operators[0], + this->operators[1]); ASSERT_EQ(cmp->get_size(), gko::dim<2>(2, 3)); ASSERT_EQ(cmp->get_operators().size(), 2); - ASSERT_EQ(cmp->get_operators()[0], operators[0]); - ASSERT_EQ(cmp->get_operators()[1], operators[1]); + ASSERT_EQ(cmp->get_operators()[0], this->operators[0]); + ASSERT_EQ(cmp->get_operators()[1], this->operators[1]); } diff --git a/core/test/base/dim.cpp b/core/test/base/dim.cpp index 80143caf9aa..b94ef6672c1 100644 --- 
a/core/test/base/dim.cpp +++ b/core/test/base/dim.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include + + #include @@ -48,6 +51,16 @@ TEST(Dim, ConstructsCorrectObject) } +TEST(Dim, ConstructsCorrectConstexprObject) +{ + constexpr gko::dim<3> d{4, 5, 6}; + + ASSERT_EQ(d[0], 4); + ASSERT_EQ(d[1], 5); + ASSERT_EQ(d[2], 6); +} + + TEST(Dim, ConstructsSquareObject) { gko::dim<2> d{5}; @@ -66,6 +79,34 @@ TEST(Dim, ConstructsNullObject) } +class dim_manager { +public: + using dim = gko::dim<3>; + const dim &get_size() const { return size_; } + + static std::unique_ptr create(const dim &size) + { + return std::unique_ptr{new dim_manager{size}}; + } + +private: + dim_manager(const dim &size) : size_{size} {} + dim size_; +}; + + +TEST(Dim, CopiesProperlyOnHeap) +{ + auto manager = dim_manager::create(gko::dim<3>{1, 2, 3}); + + const auto copy = manager->get_size(); + + ASSERT_EQ(copy[0], 1); + ASSERT_EQ(copy[1], 2); + ASSERT_EQ(copy[2], 3); +} + + TEST(Dim, ConvertsToBool) { gko::dim<2> d1{}; diff --git a/core/test/base/exception.cpp b/core/test/base/exception.cpp index 815a08991af..b04d7553103 100644 --- a/core/test/base/exception.cpp +++ b/core/test/base/exception.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -98,6 +98,30 @@ TEST(ExceptionClasses, CusparseErrorReturnsCorrectWhatMessage) } +TEST(ExceptionClasses, HipErrorReturnsCorrectWhatMessage) +{ + gko::HipError error("test_file.cpp", 123, "test_func", 1); + std::string expected = "test_file.cpp:123: test_func: "; + ASSERT_EQ(expected, std::string(error.what()).substr(0, expected.size())); +} + + +TEST(ExceptionClasses, HipblasErrorReturnsCorrectWhatMessage) +{ + gko::HipblasError error("test_file.cpp", 123, "test_func", 1); + std::string expected = "test_file.cpp:123: test_func: "; + ASSERT_EQ(expected, std::string(error.what()).substr(0, expected.size())); +} + + +TEST(ExceptionClasses, HipsparseErrorReturnsCorrectWhatMessage) +{ + gko::HipsparseError error("test_file.cpp", 123, "test_func", 1); + std::string expected = "test_file.cpp:123: test_func: "; + ASSERT_EQ(expected, std::string(error.what()).substr(0, expected.size())); +} + + TEST(ExceptionClasses, DimensionMismatchReturnsCorrectWhatMessage) { gko::DimensionMismatch error("test_file.cpp", 243, "test_func", "a", 3, 4, diff --git a/core/test/base/exception_helpers.cpp b/core/test/base/exception_helpers.cpp index dd013835300..cad8a3d5684 100644 --- a/core/test/base/exception_helpers.cpp +++ b/core/test/base/exception_helpers.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -55,11 +55,48 @@ TEST(NotCompiled, ThrowsWhenUsed) } -void does_not_support_int() { GKO_NOT_SUPPORTED(int); } +template +void test_not_supported_impl(const T &obj) +{ + try { + GKO_NOT_SUPPORTED(obj); + FAIL(); + } catch (gko::NotSupported &m) { + // check for equal suffix + std::string msg{m.what()}; + auto expected = gko::name_demangling::get_type_name(typeid(Expected)); + ASSERT_TRUE( + std::equal(expected.rbegin(), expected.rend(), msg.rbegin())); + } +} -TEST(NotSupported, ReturnsNotSupportedException) + +TEST(NotSupported, ReturnsIntNotSupportedException) { - ASSERT_THROW(does_not_support_int(), gko::NotSupported); + test_not_supported_impl(int{}); +} + + +struct Base { + virtual ~Base() = default; +}; + +struct Derived : Base {}; + + +TEST(NotSupported, ReturnsPtrNotSupportedException) +{ + Derived d; + Base *b = &d; + test_not_supported_impl(b); +} + + +TEST(NotSupported, ReturnsRefNotSupportedException) +{ + Derived d; + Base &b = d; + test_not_supported_impl(b); } @@ -87,6 +124,30 @@ TEST(CudaError, ReturnsCusparseError) } +void throws_hip_error() { throw GKO_HIP_ERROR(0); } + +TEST(HipError, ReturnsHipError) +{ + ASSERT_THROW(throws_hip_error(), gko::HipError); +} + + +void throws_hipblas_error() { throw GKO_HIPBLAS_ERROR(0); } + +TEST(HipError, ReturnsHipblasError) +{ + ASSERT_THROW(throws_hipblas_error(), gko::HipblasError); +} + + +void throws_hipsparse_error() { throw GKO_HIPSPARSE_ERROR(0); } + +TEST(HipError, ReturnsHipsparseError) +{ + ASSERT_THROW(throws_hipsparse_error(), gko::HipsparseError); +} + + TEST(AssertIsSquareMatrix, DoesNotThrowWhenIsSquareMatrix) { ASSERT_NO_THROW(GKO_ASSERT_IS_SQUARE_MATRIX(gko::dim<2>(3, 3))); diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp index dca77292eda..1b2e1b0698e 100644 --- a/core/test/base/executor.cpp +++ b/core/test/base/executor.cpp @@ -1,5 +1,5 @@ /************************************************************* 
-Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -59,10 +59,14 @@ class ExampleOperation : public gko::Operation { { value = 2; } - void run(std::shared_ptr) const override + void run(std::shared_ptr) const override { value = 3; } + void run(std::shared_ptr) const override + { + value = 4; + } int &value; }; @@ -83,9 +87,10 @@ TEST(OmpExecutor, RunsCorrectLambdaOperation) int value = 0; auto omp_lambda = [&value]() { value = 1; }; auto cuda_lambda = [&value]() { value = 2; }; + auto hip_lambda = [&value]() { value = 3; }; exec_ptr omp = gko::OmpExecutor::create(); - omp->run(omp_lambda, cuda_lambda); + omp->run(omp_lambda, cuda_lambda, hip_lambda); ASSERT_EQ(1, value); } @@ -128,7 +133,7 @@ TEST(OmpExecutor, CopiesData) int *copy = omp->alloc(num_elems); // user code is run on the OMP, so local variables are in OMP memory - omp->copy_from(omp.get(), num_elems, orig, copy); + omp->copy(num_elems, orig, copy); EXPECT_EQ(3, copy[0]); EXPECT_EQ(8, copy[1]); @@ -150,7 +155,7 @@ TEST(ReferenceExecutor, RunsCorrectOperation) exec_ptr ref = gko::ReferenceExecutor::create(); ref->run(ExampleOperation(value)); - ASSERT_EQ(3, value); + ASSERT_EQ(4, value); } @@ -159,9 +164,10 @@ TEST(ReferenceExecutor, RunsCorrectLambdaOperation) int value = 0; auto omp_lambda = [&value]() { value = 1; }; auto cuda_lambda = [&value]() { value = 2; }; + auto hip_lambda = [&value]() { value = 3; }; exec_ptr ref = gko::ReferenceExecutor::create(); - ref->run(omp_lambda, cuda_lambda); + ref->run(omp_lambda, cuda_lambda, hip_lambda); ASSERT_EQ(1, value); } @@ -204,7 +210,7 @@ TEST(ReferenceExecutor, CopiesData) int *copy = ref->alloc(num_elems); // ReferenceExecutor is a type of OMP executor, so this is O.K. 
- ref->copy_from(ref.get(), num_elems, orig, copy); + ref->copy(num_elems, orig, copy); EXPECT_EQ(3, copy[0]); EXPECT_EQ(8, copy[1]); @@ -212,6 +218,18 @@ TEST(ReferenceExecutor, CopiesData) } +TEST(ReferenceExecutor, CopiesSingleValue) +{ + exec_ptr ref = gko::ReferenceExecutor::create(); + int *el = ref->alloc(1); + el[0] = 83683; + + EXPECT_EQ(83683, ref->copy_val_to_host(el)); + + ref->free(el); +} + + TEST(ReferenceExecutor, CopiesDataFromOmp) { int orig[] = {3, 8}; @@ -257,7 +275,8 @@ TEST(ReferenceExecutor, IsItsOwnMaster) TEST(CudaExecutor, RunsCorrectOperation) { int value = 0; - exec_ptr cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); + exec_ptr cuda = + gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true); cuda->run(ExampleOperation(value)); ASSERT_EQ(2, value); @@ -269,9 +288,11 @@ TEST(CudaExecutor, RunsCorrectLambdaOperation) int value = 0; auto omp_lambda = [&value]() { value = 1; }; auto cuda_lambda = [&value]() { value = 2; }; - exec_ptr cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); + auto hip_lambda = [&value]() { value = 3; }; + exec_ptr cuda = + gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true); - cuda->run(omp_lambda, cuda_lambda); + cuda->run(omp_lambda, cuda_lambda, hip_lambda); ASSERT_EQ(2, value); } @@ -294,6 +315,105 @@ TEST(CudaExecutor, KnowsItsDeviceId) } +TEST(CudaExecutor, CanGetDeviceResetBoolean) +{ + auto omp = gko::OmpExecutor::create(); + auto cuda = gko::CudaExecutor::create(0, omp); + + ASSERT_EQ(false, cuda->get_device_reset()); +} + + +TEST(CudaExecutor, CanSetDefaultDeviceResetBoolean) +{ + auto omp = gko::OmpExecutor::create(); + auto cuda = gko::CudaExecutor::create(0, omp, true); + + ASSERT_EQ(true, cuda->get_device_reset()); +} + + +TEST(CudaExecutor, CanSetDeviceResetBoolean) +{ + auto omp = gko::OmpExecutor::create(); + auto cuda = gko::CudaExecutor::create(0, omp); + + cuda->set_device_reset(true); + + ASSERT_EQ(true, cuda->get_device_reset()); +} + + 
+TEST(HipExecutor, RunsCorrectOperation) +{ + int value = 0; + exec_ptr hip = gko::HipExecutor::create(0, gko::OmpExecutor::create()); + + hip->run(ExampleOperation(value)); + ASSERT_EQ(3, value); +} + + +TEST(HipExecutor, RunsCorrectLambdaOperation) +{ + int value = 0; + auto omp_lambda = [&value]() { value = 1; }; + auto cuda_lambda = [&value]() { value = 2; }; + auto hip_lambda = [&value]() { value = 3; }; + exec_ptr hip = gko::HipExecutor::create(0, gko::OmpExecutor::create()); + + hip->run(omp_lambda, cuda_lambda, hip_lambda); + ASSERT_EQ(3, value); +} + + +TEST(HipExecutor, KnowsItsMaster) +{ + auto omp = gko::OmpExecutor::create(); + exec_ptr hip = gko::HipExecutor::create(0, omp); + + ASSERT_EQ(omp, hip->get_master()); +} + + +TEST(HipExecutor, KnowsItsDeviceId) +{ + auto omp = gko::OmpExecutor::create(); + auto hip = gko::HipExecutor::create(0, omp); + + ASSERT_EQ(0, hip->get_device_id()); +} + + +TEST(HipExecutor, CanGetDeviceResetBoolean) +{ + auto omp = gko::OmpExecutor::create(); + auto hip = gko::HipExecutor::create(0, omp); + + ASSERT_EQ(false, hip->get_device_reset()); +} + + +TEST(HipExecutor, CanSetDefaultDeviceResetBoolean) +{ + auto omp = gko::OmpExecutor::create(); + auto hip = gko::HipExecutor::create(0, omp, true); + + ASSERT_EQ(true, hip->get_device_reset()); +} + + +TEST(HipExecutor, CanSetDeviceResetBoolean) +{ + auto omp = gko::OmpExecutor::create(); + auto hip = gko::HipExecutor::create(0, omp); + + hip->set_device_reset(true); + + ASSERT_EQ(true, hip->get_device_reset()); +} + + template struct mock_free : T { /** diff --git a/core/test/base/extended_float.cpp b/core/test/base/extended_float.cpp index ef31e5c9210..f2f7da597bf 100644 --- a/core/test/base/extended_float.cpp +++ b/core/test/base/extended_float.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -30,16 +30,16 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include - - -#include +#include "core/base/extended_float.hpp" #include #include +#include + + namespace { diff --git a/core/test/base/iterator_factory.cpp b/core/test/base/iterator_factory.cpp index 6e350e40382..b2b873a627a 100644 --- a/core/test/base/iterator_factory.cpp +++ b/core/test/base/iterator_factory.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -40,23 +40,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include "core/test/utils/assertions.hpp" +#include "core/test/utils.hpp" namespace { +template class IteratorFactory : public ::testing::Test { protected: - using int_type = int; - using double_type = double; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; IteratorFactory() - : reversed_int{100, 50, 10, 9, 8, 7, 5, 5, 4, 3, 2, 1, 0, -1, -2}, - ordered_int{-2, -1, 0, 1, 2, 3, 4, 5, 5, 7, 8, 9, 10, 50, 100}, - reversed_double{15., 14., 13., 12., 11., 10., 9., 7., - 7., 6., 5., 4., 3., 2., -1.}, - ordered_double{-1., 2., 3., 4., 5., 6., 7., 7., - 9., 10., 11., 12., 13., 14., 15.} + : reversed_index{100, 50, 10, 9, 8, 7, 5, 5, 4, 3, 2, 1, 0, -1, -2}, + ordered_index{-2, -1, 0, 1, 2, 3, 4, 5, 5, 7, 8, 9, 10, 50, 100}, + reversed_value{15., 14., 13., 12., 11., 10., 9., 7., + 7., 6., 5., 4., 3., 2., -1.}, + ordered_value{-1., 2., 3., 4., 5., 6., 7., 7., + 9., 10., 11., 12., 13., 14., 15.} {} template 
@@ -87,16 +90,20 @@ class IteratorFactory : public ::testing::Test { return true; } - const std::vector reversed_int; - const std::vector ordered_int; - const std::vector reversed_double; - const std::vector ordered_double; + const std::vector reversed_index; + const std::vector ordered_index; + const std::vector reversed_value; + const std::vector ordered_value; }; +TYPED_TEST_CASE(IteratorFactory, gko::test::ValueIndexTypes); -TEST_F(IteratorFactory, EmptyIterator) + +TYPED_TEST(IteratorFactory, EmptyIterator) { - auto test_iter = gko::detail::IteratorFactory( + using index_type = typename TestFixture::index_type; + using value_type = typename TestFixture::value_type; + auto test_iter = gko::detail::IteratorFactory( nullptr, nullptr, 0); ASSERT_TRUE(test_iter.begin() == test_iter.end()); @@ -104,66 +111,78 @@ TEST_F(IteratorFactory, EmptyIterator) } -TEST_F(IteratorFactory, SortingReversedWithIterator) +TYPED_TEST(IteratorFactory, SortingReversedWithIterator) { - std::vector vec1{reversed_int}; - std::vector vec2{ordered_double}; + using index_type = typename TestFixture::index_type; + using value_type = typename TestFixture::value_type; + std::vector vec1{this->reversed_index}; + std::vector vec2{this->ordered_value}; - auto test_iter = gko::detail::IteratorFactory( + auto test_iter = gko::detail::IteratorFactory( vec1.data(), vec2.data(), vec1.size()); std::sort(test_iter.begin(), test_iter.end()); - check_vector_equal(vec1, ordered_int); - check_vector_equal(vec2, reversed_double); + this->check_vector_equal(vec1, this->ordered_index); + this->check_vector_equal(vec2, this->reversed_value); } -TEST_F(IteratorFactory, SortingAlreadySortedWithIterator) +TYPED_TEST(IteratorFactory, SortingAlreadySortedWithIterator) { - std::vector vec1{ordered_int}; - std::vector vec2{ordered_double}; + using index_type = typename TestFixture::index_type; + using value_type = typename TestFixture::value_type; + std::vector vec1{this->ordered_index}; + std::vector 
vec2{this->ordered_value}; - auto test_iter = gko::detail::IteratorFactory( + auto test_iter = gko::detail::IteratorFactory( vec1.data(), vec2.data(), vec1.size()); std::sort(test_iter.begin(), test_iter.end()); - check_vector_equal(vec1, ordered_int); - check_vector_equal(vec2, ordered_double); + this->check_vector_equal(vec1, this->ordered_index); + this->check_vector_equal(vec2, this->ordered_value); } -TEST_F(IteratorFactory, IteratorReferenceOperatorSmaller) +TYPED_TEST(IteratorFactory, IteratorReferenceOperatorSmaller) { - std::vector vec1{reversed_int}; - std::vector vec2{ordered_double}; + using index_type = typename TestFixture::index_type; + using value_type = typename TestFixture::value_type; + std::vector vec1{this->reversed_index}; + std::vector vec2{this->ordered_value}; - auto test_iter = gko::detail::IteratorFactory( + auto test_iter = gko::detail::IteratorFactory( vec1.data(), vec2.data(), vec1.size()); - bool is_sorted = is_sorted_iterator(test_iter.begin(), test_iter.end()); + bool is_sorted = + this->is_sorted_iterator(test_iter.begin(), test_iter.end()); ASSERT_FALSE(is_sorted); } -TEST_F(IteratorFactory, IteratorReferenceOperatorSmaller2) +TYPED_TEST(IteratorFactory, IteratorReferenceOperatorSmaller2) { - std::vector vec1{ordered_int}; - std::vector vec2{ordered_double}; + using index_type = typename TestFixture::index_type; + using value_type = typename TestFixture::value_type; + std::vector vec1{this->ordered_index}; + std::vector vec2{this->ordered_value}; - auto test_iter = gko::detail::IteratorFactory( + auto test_iter = gko::detail::IteratorFactory( vec1.data(), vec2.data(), vec1.size()); - bool is_sorted = is_sorted_iterator(test_iter.begin(), test_iter.end()); + bool is_sorted = + this->is_sorted_iterator(test_iter.begin(), test_iter.end()); ASSERT_TRUE(is_sorted); } -TEST_F(IteratorFactory, IncreasingIterator) +TYPED_TEST(IteratorFactory, IncreasingIterator) { - std::vector vec1{reversed_int}; - std::vector vec2{ordered_double}; + 
using index_type = typename TestFixture::index_type; + using value_type = typename TestFixture::value_type; + std::vector vec1{this->reversed_index}; + std::vector vec2{this->ordered_value}; - auto test_iter = gko::detail::IteratorFactory( + auto test_iter = gko::detail::IteratorFactory( vec1.data(), vec2.data(), vec1.size()); auto begin = test_iter.begin(); auto plus_2 = begin + 2; @@ -187,12 +206,14 @@ TEST_F(IteratorFactory, IncreasingIterator) } -TEST_F(IteratorFactory, DecreasingIterator) +TYPED_TEST(IteratorFactory, DecreasingIterator) { - std::vector vec1{reversed_int}; - std::vector vec2{ordered_double}; + using index_type = typename TestFixture::index_type; + using value_type = typename TestFixture::value_type; + std::vector vec1{this->reversed_index}; + std::vector vec2{this->ordered_value}; - auto test_iter = gko::detail::IteratorFactory( + auto test_iter = gko::detail::IteratorFactory( vec1.data(), vec2.data(), vec1.size()); auto iter = test_iter.begin() + 5; auto minus_2 = iter - 2; @@ -216,16 +237,18 @@ TEST_F(IteratorFactory, DecreasingIterator) } -TEST_F(IteratorFactory, CorrectDereferencing) +TYPED_TEST(IteratorFactory, CorrectDereferencing) { - std::vector vec1{reversed_int}; - std::vector vec2{ordered_double}; + using index_type_it = typename TestFixture::index_type; + using value_type_it = typename TestFixture::value_type; + std::vector vec1{this->reversed_index}; + std::vector vec2{this->ordered_value}; constexpr int element_to_test = 3; - auto test_iter = gko::detail::IteratorFactory( + auto test_iter = gko::detail::IteratorFactory( vec1.data(), vec2.data(), vec1.size()); auto begin = test_iter.begin(); - using value_type = decltype(begin)::value_type; + using value_type = typename decltype(begin)::value_type; auto to_test_ref = *(begin + element_to_test); value_type to_test_pair = to_test_ref; // Testing implicit conversion @@ -236,51 +259,55 @@ TEST_F(IteratorFactory, CorrectDereferencing) } -TEST_F(IteratorFactory, CorrectSwapping) 
+TYPED_TEST(IteratorFactory, CorrectSwapping) { - std::vector vec1{reversed_int}; - std::vector vec2{ordered_double}; + using index_type = typename TestFixture::index_type; + using value_type = typename TestFixture::value_type; + std::vector vec1{this->reversed_index}; + std::vector vec2{this->ordered_value}; - auto test_iter = gko::detail::IteratorFactory( + auto test_iter = gko::detail::IteratorFactory( vec1.data(), vec2.data(), vec1.size()); auto first_el_reference = *test_iter.begin(); auto second_el_reference = *(test_iter.begin() + 1); swap(first_el_reference, second_el_reference); - ASSERT_TRUE(vec1[0] == reversed_int[1]); - ASSERT_TRUE(vec1[1] == reversed_int[0]); - ASSERT_TRUE(vec2[0] == ordered_double[1]); - ASSERT_TRUE(vec2[1] == ordered_double[0]); + ASSERT_TRUE(vec1[0] == this->reversed_index[1]); + ASSERT_TRUE(vec1[1] == this->reversed_index[0]); + ASSERT_TRUE(vec2[0] == this->ordered_value[1]); + ASSERT_TRUE(vec2[1] == this->ordered_value[0]); // Make sure the other values were not touched. 
for (size_t i = 2; i < vec1.size(); ++i) { - ASSERT_TRUE(vec1[i] == reversed_int[i]); - ASSERT_TRUE(vec2[i] == ordered_double[i]); + ASSERT_TRUE(vec1[i] == this->reversed_index[i]); + ASSERT_TRUE(vec2[i] == this->ordered_value[i]); } } -TEST_F(IteratorFactory, CorrectHandWrittenSwapping) +TYPED_TEST(IteratorFactory, CorrectHandWrittenSwapping) { - std::vector vec1{reversed_int}; - std::vector vec2{ordered_double}; + using index_type = typename TestFixture::index_type; + using value_type = typename TestFixture::value_type; + std::vector vec1{this->reversed_index}; + std::vector vec2{this->ordered_value}; - auto test_iter = gko::detail::IteratorFactory( + auto test_iter = gko::detail::IteratorFactory( vec1.data(), vec2.data(), vec1.size()); auto first_el_reference = *test_iter.begin(); auto second_el_reference = *(test_iter.begin() + 1); - auto temp = static_cast( + auto temp = static_cast( first_el_reference); first_el_reference = second_el_reference; second_el_reference = temp; - ASSERT_TRUE(vec1[0] == reversed_int[1]); - ASSERT_TRUE(vec1[1] == reversed_int[0]); - ASSERT_TRUE(vec2[0] == ordered_double[1]); - ASSERT_TRUE(vec2[1] == ordered_double[0]); + ASSERT_TRUE(vec1[0] == this->reversed_index[1]); + ASSERT_TRUE(vec1[1] == this->reversed_index[0]); + ASSERT_TRUE(vec2[0] == this->ordered_value[1]); + ASSERT_TRUE(vec2[1] == this->ordered_value[0]); // Make sure the other values were not touched. 
for (size_t i = 2; i < vec1.size(); ++i) { - ASSERT_TRUE(vec1[i] == reversed_int[i]); - ASSERT_TRUE(vec2[i] == ordered_double[i]); + ASSERT_TRUE(vec1[i] == this->reversed_index[i]); + ASSERT_TRUE(vec2[i] == this->ordered_value[i]); } } diff --git a/core/test/base/lin_op.cpp b/core/test/base/lin_op.cpp index 2ade2cd50b9..a8c2866f3cc 100644 --- a/core/test/base/lin_op.cpp +++ b/core/test/base/lin_op.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -236,6 +236,12 @@ TEST_F(EnableLinOp, ExtendedApplyCopiesBackOnlyX) } +TEST_F(EnableLinOp, ApplyUsesInitialGuessReturnsFalse) +{ + ASSERT_FALSE(op->apply_uses_initial_guess()); +} + + template class DummyLinOpWithFactory : public gko::EnableLinOp> { diff --git a/core/test/base/math.cpp b/core/test/base/math.cpp index efe0a05943b..c63cd4ae8e9 100644 --- a/core/test/base/math.cpp +++ b/core/test/base/math.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include @@ -44,30 +45,38 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace { +static_assert( + std::is_same()))>::value, + "real must return a real type"); +static_assert( + std::is_same()))>::value, + "imag must return a real type"); + + template -void test_real_isfinite() +void test_real_is_finite() { using limits = std::numeric_limits; constexpr auto inf = limits::infinity(); // Use volatile to avoid MSVC report divided by zero. 
volatile const T zero{0}; - ASSERT_TRUE(gko::isfinite(T{0})); - ASSERT_TRUE(gko::isfinite(-T{0})); - ASSERT_TRUE(gko::isfinite(T{1})); - ASSERT_FALSE(gko::isfinite(inf)); - ASSERT_FALSE(gko::isfinite(-inf)); - ASSERT_FALSE(gko::isfinite(limits::quiet_NaN())); - ASSERT_FALSE(gko::isfinite(limits::signaling_NaN())); - ASSERT_FALSE(gko::isfinite(inf - inf)); // results in nan - ASSERT_FALSE(gko::isfinite(inf / inf)); // results in nan - ASSERT_FALSE(gko::isfinite(inf * T{2})); // results in inf - ASSERT_FALSE(gko::isfinite(T{1} / zero)); // results in inf - ASSERT_FALSE(gko::isfinite(T{0} / zero)); // results in nan + ASSERT_TRUE(gko::is_finite(T{0})); + ASSERT_TRUE(gko::is_finite(-T{0})); + ASSERT_TRUE(gko::is_finite(T{1})); + ASSERT_FALSE(gko::is_finite(inf)); + ASSERT_FALSE(gko::is_finite(-inf)); + ASSERT_FALSE(gko::is_finite(limits::quiet_NaN())); + ASSERT_FALSE(gko::is_finite(limits::signaling_NaN())); + ASSERT_FALSE(gko::is_finite(inf - inf)); // results in nan + ASSERT_FALSE(gko::is_finite(inf / inf)); // results in nan + ASSERT_FALSE(gko::is_finite(inf * T{2})); // results in inf + ASSERT_FALSE(gko::is_finite(T{1} / zero)); // results in inf + ASSERT_FALSE(gko::is_finite(T{0} / zero)); // results in nan } template -void test_complex_isfinite() +void test_complex_is_finite() { static_assert(gko::is_complex_s::value, "Template type must be a complex type."); @@ -78,31 +87,52 @@ void test_complex_isfinite() constexpr auto quiet_nan = limits::quiet_NaN(); constexpr auto signaling_nan = limits::signaling_NaN(); - ASSERT_TRUE(gko::isfinite(c_type{T{0}, T{0}})); - ASSERT_TRUE(gko::isfinite(c_type{-T{0}, -T{0}})); - ASSERT_TRUE(gko::isfinite(c_type{T{1}, T{0}})); - ASSERT_TRUE(gko::isfinite(c_type{T{0}, T{1}})); - ASSERT_FALSE(gko::isfinite(c_type{inf, T{0}})); - ASSERT_FALSE(gko::isfinite(c_type{-inf, T{0}})); - ASSERT_FALSE(gko::isfinite(c_type{quiet_nan, T{0}})); - ASSERT_FALSE(gko::isfinite(c_type{signaling_nan, T{0}})); - ASSERT_FALSE(gko::isfinite(c_type{T{0}, 
inf})); - ASSERT_FALSE(gko::isfinite(c_type{T{0}, -inf})); - ASSERT_FALSE(gko::isfinite(c_type{T{0}, quiet_nan})); - ASSERT_FALSE(gko::isfinite(c_type{T{0}, signaling_nan})); + ASSERT_TRUE(gko::is_finite(c_type{T{0}, T{0}})); + ASSERT_TRUE(gko::is_finite(c_type{-T{0}, -T{0}})); + ASSERT_TRUE(gko::is_finite(c_type{T{1}, T{0}})); + ASSERT_TRUE(gko::is_finite(c_type{T{0}, T{1}})); + ASSERT_FALSE(gko::is_finite(c_type{inf, T{0}})); + ASSERT_FALSE(gko::is_finite(c_type{-inf, T{0}})); + ASSERT_FALSE(gko::is_finite(c_type{quiet_nan, T{0}})); + ASSERT_FALSE(gko::is_finite(c_type{signaling_nan, T{0}})); + ASSERT_FALSE(gko::is_finite(c_type{T{0}, inf})); + ASSERT_FALSE(gko::is_finite(c_type{T{0}, -inf})); + ASSERT_FALSE(gko::is_finite(c_type{T{0}, quiet_nan})); + ASSERT_FALSE(gko::is_finite(c_type{T{0}, signaling_nan})); } -TEST(IsFinite, Float) { test_real_isfinite(); } +TEST(IsFinite, Float) { test_real_is_finite(); } + + +TEST(IsFinite, Double) { test_real_is_finite(); } + + +TEST(IsFinite, FloatComplex) { test_complex_is_finite>(); } + +TEST(IsFinite, DoubleComplex) +{ + test_complex_is_finite>(); +} -TEST(IsFinite, Double) { test_real_isfinite(); } +TEST(Conjugate, FloatComplex) +{ + std::complex a(1, 1); + std::complex b(1, -1); -TEST(IsFinite, FloatComplex) { test_complex_isfinite>(); } + ASSERT_EQ(conj(a), b); +} -TEST(IsFinite, DoubleComplex) { test_complex_isfinite>(); } +TEST(Conjugate, DoubleComplex) +{ + std::complex a(1, 1); + std::complex b(1, -1); + + ASSERT_EQ(conj(a), b); +} } // namespace diff --git a/core/test/base/matrix_data.cpp b/core/test/base/matrix_data.cpp index 129c5c8c470..fcb2f48f29b 100644 --- a/core/test/base/matrix_data.cpp +++ b/core/test/base/matrix_data.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -33,10 +33,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include +#include -#include +#include namespace { diff --git a/core/test/base/mtx_io.cpp b/core/test/base/mtx_io.cpp index aa733b74781..a5a17beeefc 100644 --- a/core/test/base/mtx_io.cpp +++ b/core/test/base/mtx_io.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,20 +33,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include +#include -#include +#include #include #include +#include "core/test/utils.hpp" + + namespace { -TEST(MtxReader, ReadsDenseRealMtx) +TEST(MtxReader, ReadsDenseDoubleRealMtx) { using tpl = gko::matrix_data::nonzero_type; std::istringstream iss( @@ -72,9 +75,35 @@ TEST(MtxReader, ReadsDenseRealMtx) } -TEST(MtxReader, ReadsDenseIntegerMtx) +TEST(MtxReader, ReadsDenseDoubleRealMtxWith64Index) { - using tpl = gko::matrix_data::nonzero_type; + using tpl = gko::matrix_data::nonzero_type; + std::istringstream iss( + "%%MatrixMarket matrix array real general\n" + "2 3\n" + "1.0\n" + "0.0\n" + "3.0\n" + "5.0\n" + "2.0\n" + "0.0\n"); + + auto data = gko::read_raw(iss); + + ASSERT_EQ(data.size, gko::dim<2>(2, 3)); + auto &v = data.nonzeros; + ASSERT_EQ(v[0], tpl(0, 0, 1.0)); + ASSERT_EQ(v[1], tpl(0, 1, 3.0)); + ASSERT_EQ(v[2], tpl(0, 2, 2.0)); + ASSERT_EQ(v[3], tpl(1, 0, 0.0)); + ASSERT_EQ(v[4], tpl(1, 1, 5.0)); + ASSERT_EQ(v[5], tpl(1, 2, 0.0)); +} + + +TEST(MtxReader, ReadsDenseFloatIntegerMtx) +{ + using tpl = gko::matrix_data::nonzero_type; std::istringstream iss( "%%MatrixMarket matrix array integer general\n" "2 3\n" @@ -85,7 +114,7 @@ TEST(MtxReader, ReadsDenseIntegerMtx) "2\n" "0\n"); - auto data = gko::read_raw(iss); + auto data = 
gko::read_raw(iss); ASSERT_EQ(data.size, gko::dim<2>(2, 3)); auto &v = data.nonzeros; @@ -98,9 +127,89 @@ TEST(MtxReader, ReadsDenseIntegerMtx) } -TEST(MtxReader, ReadsDenseComplexMtx) +TEST(MtxReader, ReadsDenseFloatIntegerMtxWith64Index) +{ + using tpl = gko::matrix_data::nonzero_type; + std::istringstream iss( + "%%MatrixMarket matrix array integer general\n" + "2 3\n" + "1\n" + "0\n" + "3\n" + "5\n" + "2\n" + "0\n"); + + auto data = gko::read_raw(iss); + + ASSERT_EQ(data.size, gko::dim<2>(2, 3)); + auto &v = data.nonzeros; + ASSERT_EQ(v[0], tpl(0, 0, 1.0)); + ASSERT_EQ(v[1], tpl(0, 1, 3.0)); + ASSERT_EQ(v[2], tpl(0, 2, 2.0)); + ASSERT_EQ(v[3], tpl(1, 0, 0.0)); + ASSERT_EQ(v[4], tpl(1, 1, 5.0)); + ASSERT_EQ(v[5], tpl(1, 2, 0.0)); +} + + +TEST(MtxReader, ReadsDenseComplexDoubleMtx) +{ + using cpx = std::complex; + using tpl = gko::matrix_data::nonzero_type; + std::istringstream iss( + "%%MatrixMarket matrix array complex general\n" + "2 3\n" + "1.0 2.0\n" + "0.0 0.0\n" + "3.0 1.0\n" + "5.0 3.0\n" + "2.0 4.0\n" + "0.0 0.0\n"); + + auto data = gko::read_raw(iss); + + ASSERT_EQ(data.size, gko::dim<2>(2, 3)); + auto &v = data.nonzeros; + ASSERT_EQ(v[0], tpl(0, 0, cpx(1.0, 2.0))); + ASSERT_EQ(v[1], tpl(0, 1, cpx(3.0, 1.0))); + ASSERT_EQ(v[2], tpl(0, 2, cpx(2.0, 4.0))); + ASSERT_EQ(v[3], tpl(1, 0, cpx(0.0, 0.0))); + ASSERT_EQ(v[4], tpl(1, 1, cpx(5.0, 3.0))); + ASSERT_EQ(v[5], tpl(1, 2, cpx(0.0, 0.0))); +} + + +TEST(MtxReader, ReadsDenseComplexDoubleMtxWith64Index) { using cpx = std::complex; + using tpl = gko::matrix_data::nonzero_type; + std::istringstream iss( + "%%MatrixMarket matrix array complex general\n" + "2 3\n" + "1.0 2.0\n" + "0.0 0.0\n" + "3.0 1.0\n" + "5.0 3.0\n" + "2.0 4.0\n" + "0.0 0.0\n"); + + auto data = gko::read_raw(iss); + + ASSERT_EQ(data.size, gko::dim<2>(2, 3)); + auto &v = data.nonzeros; + ASSERT_EQ(v[0], tpl(0, 0, cpx(1.0, 2.0))); + ASSERT_EQ(v[1], tpl(0, 1, cpx(3.0, 1.0))); + ASSERT_EQ(v[2], tpl(0, 2, cpx(2.0, 4.0))); + ASSERT_EQ(v[3], tpl(1, 
0, cpx(0.0, 0.0))); + ASSERT_EQ(v[4], tpl(1, 1, cpx(5.0, 3.0))); + ASSERT_EQ(v[5], tpl(1, 2, cpx(0.0, 0.0))); +} + + +TEST(MtxReader, ReadsDenseComplexFloatMtx) +{ + using cpx = std::complex; using tpl = gko::matrix_data::nonzero_type; std::istringstream iss( "%%MatrixMarket matrix array complex general\n" @@ -125,6 +234,33 @@ TEST(MtxReader, ReadsDenseComplexMtx) } +TEST(MtxReader, ReadsDenseComplexFloatMtxWith64Index) +{ + using cpx = std::complex; + using tpl = gko::matrix_data::nonzero_type; + std::istringstream iss( + "%%MatrixMarket matrix array complex general\n" + "2 3\n" + "1.0 2.0\n" + "0.0 0.0\n" + "3.0 1.0\n" + "5.0 3.0\n" + "2.0 4.0\n" + "0.0 0.0\n"); + + auto data = gko::read_raw(iss); + + ASSERT_EQ(data.size, gko::dim<2>(2, 3)); + auto &v = data.nonzeros; + ASSERT_EQ(v[0], tpl(0, 0, cpx(1.0, 2.0))); + ASSERT_EQ(v[1], tpl(0, 1, cpx(3.0, 1.0))); + ASSERT_EQ(v[2], tpl(0, 2, cpx(2.0, 4.0))); + ASSERT_EQ(v[3], tpl(1, 0, cpx(0.0, 0.0))); + ASSERT_EQ(v[4], tpl(1, 1, cpx(5.0, 3.0))); + ASSERT_EQ(v[5], tpl(1, 2, cpx(0.0, 0.0))); +} + + TEST(MtxReader, ReadsSparseRealMtx) { using tpl = gko::matrix_data::nonzero_type; @@ -273,10 +409,57 @@ TEST(MtxReader, FailsWhenReadingSparseComplexMtxToRealMtx) } -TEST(MatrixData, WritesRealMatrixToMatrixMarketArray) +TEST(MatrixData, WritesDoubleRealMatrixToMatrixMarketArray) +{ + // clang-format off + gko::matrix_data data{ + {1.0, 2.0}, + {2.1, 0.0}, + {3.0, 3.2}}; + // clang-format on + std::ostringstream oss{}; + + write_raw(oss, data); + + ASSERT_EQ(oss.str(), + "%%MatrixMarket matrix array real general\n" + "3 2\n" + "1\n" + "2.1\n" + "3\n" + "2\n" + "0\n" + "3.2\n"); +} + + +TEST(MatrixData, WritesFloatRealMatrixToMatrixMarketCoordinate) { // clang-format off - gko::matrix_data<> data{ + gko::matrix_data data{ + {1.0, 2.0}, + {2.1, 0.0}, + {3.0, 3.2}}; + // clang-format on + std::ostringstream oss{}; + + write_raw(oss, data, gko::layout_type::coordinate); + + ASSERT_EQ(oss.str(), + "%%MatrixMarket matrix coordinate 
real general\n" + "3 2 5\n" + "1 1 1\n" + "1 2 2\n" + "2 1 2.1\n" + "3 1 3\n" + "3 2 3.2\n"); +} + + +TEST(MatrixData, WritesDoubleRealMatrixToMatrixMarketArrayWith64Index) +{ + // clang-format off + gko::matrix_data data{ {1.0, 2.0}, {2.1, 0.0}, {3.0, 3.2}}; @@ -297,10 +480,10 @@ TEST(MatrixData, WritesRealMatrixToMatrixMarketArray) } -TEST(MatrixData, WritesRealMatrixToMatrixMarketCoordinate) +TEST(MatrixData, WritesFloatRealMatrixToMatrixMarketCoordinateWith64Index) { // clang-format off - gko::matrix_data<> data{ + gko::matrix_data data{ {1.0, 2.0}, {2.1, 0.0}, {3.0, 3.2}}; @@ -320,10 +503,57 @@ TEST(MatrixData, WritesRealMatrixToMatrixMarketCoordinate) } -TEST(MatrixData, WritesComplexMatrixToMatrixMarketArray) +TEST(MatrixData, WritesComplexDoubleMatrixToMatrixMarketArray) +{ + // clang-format off + gko::matrix_data, gko::int32> data{ + {{1.0, 0.0}, {2.0, 3.2}}, + {{2.1, 2.2}, {0.0, 0.0}}, + {{0.0, 3.0}, {3.2, 5.3}}}; + // clang-format on + std::ostringstream oss{}; + + write_raw(oss, data); + + ASSERT_EQ(oss.str(), + "%%MatrixMarket matrix array complex general\n" + "3 2\n" + "1 0\n" + "2.1 2.2\n" + "0 3\n" + "2 3.2\n" + "0 0\n" + "3.2 5.3\n"); +} + + +TEST(MatrixData, WritesComplexFloatMatrixToMatrixMarketCoordinate) +{ + // clang-format off + gko::matrix_data, gko::int32> data{ + {{1.0, 0.0}, {2.0, 3.2}}, + {{2.1, 2.2}, {0.0, 0.0}}, + {{0.0, 3.0}, {3.2, 5.3}}}; + // clang-format on + std::ostringstream oss{}; + + write_raw(oss, data, gko::layout_type::coordinate); + + ASSERT_EQ(oss.str(), + "%%MatrixMarket matrix coordinate complex general\n" + "3 2 5\n" + "1 1 1 0\n" + "1 2 2 3.2\n" + "2 1 2.1 2.2\n" + "3 1 0 3\n" + "3 2 3.2 5.3\n"); +} + + +TEST(MatrixData, WritesComplexDoubleMatrixToMatrixMarketArrayWith64Index) { // clang-format off - gko::matrix_data> data{ + gko::matrix_data, gko::int64> data{ {{1.0, 0.0}, {2.0, 3.2}}, {{2.1, 2.2}, {0.0, 0.0}}, {{0.0, 3.0}, {3.2, 5.3}}}; @@ -344,10 +574,10 @@ TEST(MatrixData, WritesComplexMatrixToMatrixMarketArray) } 
-TEST(MatrixData, WritesComplexMatrixToMatrixMarketCoordinate) +TEST(MatrixData, WritesComplexFloatMatrixToMatrixMarketCoordinateWith64Index) { // clang-format off - gko::matrix_data> data{ + gko::matrix_data, gko::int64> data{ {{1.0, 0.0}, {2.0, 3.2}}, {{2.1, 2.2}, {0.0, 0.0}}, {{0.0, 3.0}, {3.2, 5.3}}}; @@ -401,9 +631,23 @@ class DummyLinOp }; -TEST(MtxReader, ReadsLinOpFromStream) +template +class RealDummyLinOpTest : public ::testing::Test { +protected: + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; +}; + +TYPED_TEST_CASE(RealDummyLinOpTest, gko::test::RealValueIndexTypes); + + +TYPED_TEST(RealDummyLinOpTest, ReadsLinOpFromStream) { - using tpl = gko::matrix_data::nonzero_type; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using tpl = typename gko::matrix_data::nonzero_type; std::istringstream iss( "%%MatrixMarket matrix array real general\n" "2 3\n" @@ -414,7 +658,7 @@ TEST(MtxReader, ReadsLinOpFromStream) "2.0\n" "0.0\n"); - auto lin_op = gko::read>( + auto lin_op = gko::read>( iss, gko::ReferenceExecutor::create()); const auto &data = lin_op->data_; @@ -429,9 +673,10 @@ TEST(MtxReader, ReadsLinOpFromStream) } -TEST(MtxReader, WritesLinOpToStream) +TYPED_TEST(RealDummyLinOpTest, WritesLinOpToStream) { - using tpl = gko::matrix_data::nonzero_type; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; std::istringstream iss( "%%MatrixMarket matrix array real general\n" "2 3\n" @@ -441,7 +686,7 @@ TEST(MtxReader, WritesLinOpToStream) "5.0\n" "2.0\n" "0.0\n"); - auto lin_op = gko::read>( + auto lin_op = gko::read>( iss, gko::ReferenceExecutor::create()); std::ostringstream oss{}; @@ -459,4 +704,77 @@ TEST(MtxReader, WritesLinOpToStream) } +template +class ComplexDummyLinOpTest : public ::testing::Test { +protected: 
+ using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; +}; + +TYPED_TEST_CASE(ComplexDummyLinOpTest, gko::test::ComplexValueIndexTypes); + + +TYPED_TEST(ComplexDummyLinOpTest, ReadsLinOpFromStream) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using tpl = typename gko::matrix_data::nonzero_type; + std::istringstream iss( + "%%MatrixMarket matrix array complex general\n" + "2 3\n" + "1.0 2.0\n" + "0.0 0.0\n" + "3.0 4.0\n" + "5.0 6.0\n" + "2.0 3.0\n" + "0.0 0.0\n"); + + auto lin_op = gko::read>( + iss, gko::ReferenceExecutor::create()); + + const auto &data = lin_op->data_; + ASSERT_EQ(data.size, gko::dim<2>(2, 3)); + const auto &v = data.nonzeros; + ASSERT_EQ(v[0], tpl(0, 0, value_type{1.0, 2.0})); + ASSERT_EQ(v[1], tpl(0, 1, value_type{3.0, 4.0})); + ASSERT_EQ(v[2], tpl(0, 2, value_type{2.0, 3.0})); + ASSERT_EQ(v[3], tpl(1, 0, value_type{0.0, 0.0})); + ASSERT_EQ(v[4], tpl(1, 1, value_type{5.0, 6.0})); + ASSERT_EQ(v[5], tpl(1, 2, value_type{0.0, 0.0})); +} + + +TYPED_TEST(ComplexDummyLinOpTest, WritesLinOpToStream) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + std::istringstream iss( + "%%MatrixMarket matrix array complex general\n" + "2 3\n" + "1.0 2.0\n" + "0.0 0.0\n" + "3.0 4.0\n" + "5.0 6.0\n" + "2.0 3.0\n" + "0.0 0.0\n"); + auto lin_op = gko::read>( + iss, gko::ReferenceExecutor::create()); + std::ostringstream oss{}; + + write(oss, lend(lin_op)); + + ASSERT_EQ(oss.str(), + "%%MatrixMarket matrix array complex general\n" + "2 3\n" + "1 2\n" + "0 0\n" + "3 4\n" + "5 6\n" + "2 3\n" + "0 0\n"); +} + + } // namespace diff --git a/core/test/base/perturbation.cpp b/core/test/base/perturbation.cpp index 950d81be2df..c0ddbd73cdc 100644 --- a/core/test/base/perturbation.cpp +++ b/core/test/base/perturbation.cpp @@ 
-1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/core/test/base/polymorphic_object.cpp b/core/test/base/polymorphic_object.cpp index 70d1f849a5d..2b2e32bd409 100644 --- a/core/test/base/polymorphic_object.cpp +++ b/core/test/base/polymorphic_object.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/core/test/base/range.cpp b/core/test/base/range.cpp index 4df4e3ba65e..6be342ca380 100644 --- a/core/test/base/range.cpp +++ b/core/test/base/range.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,10 +33,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include +#include -#include +#include namespace { diff --git a/core/test/base/range_accessors.cpp b/core/test/base/range_accessors.cpp index 232c5feff7b..9066566a33a 100644 --- a/core/test/base/range_accessors.cpp +++ b/core/test/base/range_accessors.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/core/test/base/sanitizers.cpp b/core/test/base/sanitizers.cpp new file mode 100644 index 00000000000..724b7f38871 --- /dev/null +++ b/core/test/base/sanitizers.cpp @@ -0,0 +1,86 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include +#include +#include + + +#include + + +TEST(Sanitizers, UseAfterFree) +{ + char *x = new char[50]; + x[0] = 'H'; + x[1] = 'I'; + x[2] = '\n'; + + std::free(x); + + static volatile char z = x[0]; +} + + +TEST(Sanitizers, MemoryLeak) +{ + char *x = new char[50]; + x[0] = 'H'; + x[1] = 'I'; + x[2] = '\n'; +} + + +TEST(Sanitizers, UndefinedBehavior) +{ + int x = std::numeric_limits::max(); + int y = 10001; + + static volatile int z = x + y; +} + + +int Global = 0; +void *Thread(void *x) +{ + Global = 42; + return x; +} + + +TEST(Sanitizers, RaceCondition) +{ + std::thread t(Thread, &Global); + + Global = 43; + t.join(); +} diff --git a/core/test/base/types.cpp b/core/test/base/types.cpp index 21e444e31ea..50979cd8a67 100644 --- a/core/test/base/types.cpp +++ b/core/test/base/types.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/core/test/base/utils.cpp b/core/test/base/utils.cpp index 07bcd6e41b0..df187cfb81f 100644 --- a/core/test/base/utils.cpp +++ b/core/test/base/utils.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -50,11 +50,17 @@ struct Base { struct Derived : Base {}; -struct NonRelated { - virtual ~NonRelated() = default; +struct NonRelated : Base {}; + + +struct Base2 { + virtual ~Base2() = default; }; +struct MultipleDerived : Base, Base2 {}; + + struct ClonableDerived : Base { ClonableDerived(std::shared_ptr exec = nullptr) : executor(exec) @@ -248,7 +254,15 @@ TEST(As, FailsToConvertIfNotRelated) Derived d; Base *b = &d; - ASSERT_THROW(gko::as(b), gko::NotSupported); + try { + gko::as(b); + FAIL(); + } catch (gko::NotSupported &m) { + std::string msg{m.what()}; + auto expected = gko::name_demangling::get_type_name(typeid(Derived)); + ASSERT_TRUE( + std::equal(expected.rbegin(), expected.rend(), msg.rbegin())); + } } @@ -266,7 +280,98 @@ TEST(As, FailsToConvertConstantIfNotRelated) Derived d; const Base *b = &d; - ASSERT_THROW(gko::as(b), gko::NotSupported); + try { + gko::as(b); + FAIL(); + } catch (gko::NotSupported &m) { + std::string msg{m.what()}; + auto expected = gko::name_demangling::get_type_name(typeid(Derived)); + ASSERT_TRUE( + std::equal(expected.rbegin(), expected.rend(), msg.rbegin())); + } +} + + +TEST(As, ConvertsPolymorphicTypeUniquePtr) +{ + auto expected = new Derived{}; + + ASSERT_EQ(gko::as(std::unique_ptr{expected}).get(), + expected); +} + + +TEST(As, FailsToConvertUniquePtrIfNotRelated) +{ + auto expected = new Derived{}; + + ASSERT_THROW(gko::as(std::unique_ptr{expected}), + gko::NotSupported); +} + + +TEST(As, ConvertsPolymorphicTypeSharedPtr) +{ + auto expected = new Derived{}; + + ASSERT_EQ(gko::as(std::shared_ptr{expected}).get(), + expected); +} + + +TEST(As, FailsToConvertSharedPtrIfNotRelated) +{ + auto expected = new Derived{}; + + ASSERT_THROW(gko::as(std::shared_ptr{expected}), + gko::NotSupported); +} + + +TEST(As, ConvertsConstPolymorphicTypeSharedPtr) +{ + auto expected = new Derived{}; + + ASSERT_EQ(gko::as(std::shared_ptr{expected}).get(), + expected); +} + + 
+TEST(As, FailsToConvertConstSharedPtrIfNotRelated) +{ + auto expected = new Derived{}; + + ASSERT_THROW(gko::as(std::shared_ptr{expected}), + gko::NotSupported); +} + + +TEST(As, CanCrossCastUniquePtr) +{ + auto obj = std::unique_ptr(new MultipleDerived{}); + auto ptr = obj.get(); + auto base = gko::as(std::move(obj)); + + ASSERT_EQ(gko::as(gko::as(std::move(base))).get(), + ptr); +} + + +TEST(As, CanCrossCastSharedPtr) +{ + auto obj = std::make_shared(); + auto base = gko::as(obj); + + ASSERT_EQ(gko::as(gko::as(base)), base); +} + + +TEST(As, CanCrossCastConstSharedPtr) +{ + auto obj = std::make_shared(); + auto base = gko::as(obj); + + ASSERT_EQ(gko::as(gko::as(base)), base); } diff --git a/core/test/base/version.cpp b/core/test/base/version.cpp index ce209e05bb5..0fc86a03b7f 100644 --- a/core/test/base/version.cpp +++ b/core/test/base/version.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,10 +33,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include +#include -#include +#include namespace { diff --git a/core/test/factorization/CMakeLists.txt b/core/test/factorization/CMakeLists.txt index 16f1fe27d91..9b2e3082e51 100644 --- a/core/test/factorization/CMakeLists.txt +++ b/core/test/factorization/CMakeLists.txt @@ -1 +1,2 @@ ginkgo_create_test(par_ilu) +ginkgo_create_test(par_ilut) diff --git a/core/test/factorization/par_ilu.cpp b/core/test/factorization/par_ilu.cpp index 5dc94c752b4..75c714fcab1 100644 --- a/core/test/factorization/par_ilu.cpp +++ b/core/test/factorization/par_ilu.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -39,13 +39,19 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/test/utils.hpp" + + namespace { +template class ParIlu : public ::testing::Test { public: - using value_type = gko::default_precision; - using index_type = gko::int32; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; using ilu_factory_type = gko::factorization::ParIlu; protected: @@ -54,29 +60,35 @@ class ParIlu : public ::testing::Test { std::shared_ptr ref; }; +TYPED_TEST_CASE(ParIlu, gko::test::ValueIndexTypes); + -TEST_F(ParIlu, SetIterations) +TYPED_TEST(ParIlu, SetIterations) { - auto factory = ilu_factory_type::build().with_iterations(5u).on(ref); + auto factory = + TestFixture::ilu_factory_type::build().with_iterations(5u).on( + this->ref); ASSERT_EQ(factory->get_parameters().iterations, 5u); } -TEST_F(ParIlu, SetSkip) +TYPED_TEST(ParIlu, SetSkip) { - auto factory = ilu_factory_type::build().with_skip_sorting(true).on(ref); + auto factory = + TestFixture::ilu_factory_type::build().with_skip_sorting(true).on( + this->ref); ASSERT_EQ(factory->get_parameters().skip_sorting, true); } -TEST_F(ParIlu, SetEverything) +TYPED_TEST(ParIlu, SetEverything) { - auto factory = ilu_factory_type::build() + auto factory = TestFixture::ilu_factory_type::build() .with_skip_sorting(false) .with_iterations(7u) - .on(ref); + .on(this->ref); ASSERT_EQ(factory->get_parameters().skip_sorting, false); ASSERT_EQ(factory->get_parameters().iterations, 7u); diff --git a/core/test/factorization/par_ilut.cpp b/core/test/factorization/par_ilut.cpp new file mode 100644 index 00000000000..e33f0bc35b5 --- /dev/null +++ b/core/test/factorization/par_ilut.cpp @@ -0,0 +1,131 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All 
rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include + + +namespace { + + +class ParIlut : public ::testing::Test { +public: + using value_type = gko::default_precision; + using index_type = gko::int32; + using ilut_factory_type = + gko::factorization::ParIlut; + +protected: + ParIlut() : ref(gko::ReferenceExecutor::create()) {} + + std::shared_ptr ref; +}; + + +TEST_F(ParIlut, SetIterations) +{ + auto factory = ilut_factory_type::build().with_iterations(6u).on(ref); + + ASSERT_EQ(factory->get_parameters().iterations, 6u); +} + + +TEST_F(ParIlut, SetSkip) +{ + auto factory = ilut_factory_type::build().with_skip_sorting(true).on(ref); + + ASSERT_EQ(factory->get_parameters().skip_sorting, true); +} + + +TEST_F(ParIlut, SetApprox) +{ + auto factory = + ilut_factory_type::build().with_approximate_select(false).on(ref); + + ASSERT_EQ(factory->get_parameters().approximate_select, false); +} + + +TEST_F(ParIlut, SetDeterministic) +{ + auto factory = + ilut_factory_type::build().with_deterministic_sample(true).on(ref); + + ASSERT_EQ(factory->get_parameters().deterministic_sample, true); +} + + +TEST_F(ParIlut, SetFillIn) +{ + auto factory = ilut_factory_type::build().with_fill_in_limit(1.2).on(ref); + + ASSERT_EQ(factory->get_parameters().fill_in_limit, 1.2); +} + + +TEST_F(ParIlut, SetDefaults) +{ + auto factory = ilut_factory_type::build().on(ref); + + ASSERT_EQ(factory->get_parameters().skip_sorting, false); + ASSERT_EQ(factory->get_parameters().iterations, 5u); + ASSERT_EQ(factory->get_parameters().approximate_select, true); + ASSERT_EQ(factory->get_parameters().deterministic_sample, false); + ASSERT_EQ(factory->get_parameters().fill_in_limit, 2.0); +} + + +TEST_F(ParIlut, SetEverything) +{ + auto factory = ilut_factory_type::build() + .with_skip_sorting(true) + .with_iterations(7u) + .with_approximate_select(false) + .with_deterministic_sample(true) + .with_fill_in_limit(1.2) + .on(ref); + + 
ASSERT_EQ(factory->get_parameters().skip_sorting, true); + ASSERT_EQ(factory->get_parameters().iterations, 7u); + ASSERT_EQ(factory->get_parameters().approximate_select, false); + ASSERT_EQ(factory->get_parameters().deterministic_sample, true); + ASSERT_EQ(factory->get_parameters().fill_in_limit, 1.2); +} + + +} // namespace diff --git a/core/test/log/convergence.cpp b/core/test/log/convergence.cpp index 116895d178f..bb05007817b 100644 --- a/core/test/log/convergence.cpp +++ b/core/test/log/convergence.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -39,13 +39,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/test/utils.hpp" + + namespace { +template +class Convergence : public ::testing::Test {}; + +TYPED_TEST_CASE(Convergence, gko::test::ValueTypes); + -TEST(Record, CanGetData) +TYPED_TEST(Convergence, CanGetData) { auto exec = gko::ReferenceExecutor::create(); - auto logger = gko::log::Convergence<>::create( + auto logger = gko::log::Convergence::create( exec, gko::log::Logger::iteration_complete_mask); ASSERT_EQ(logger->get_num_iterations(), 0); diff --git a/core/test/log/logger.cpp b/core/test/log/logger.cpp index 92e4cdfad5e..e051e16f692 100644 --- a/core/test/log/logger.cpp +++ b/core/test/log/logger.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -31,14 +31,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ #include -#include -#include -#include #include +#include + + +#include +#include + + namespace { diff --git a/core/test/log/papi.cpp b/core/test/log/papi.cpp index 676aaf5261b..d318a29f228 100644 --- a/core/test/log/papi.cpp +++ b/core/test/log/papi.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,28 +30,32 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ - #include +#include + + #include #include -#include -#include #include #include #include #include +#include "core/test/utils.hpp" + + namespace { +template class Papi : public ::testing::Test { protected: - using Dense = gko::matrix::Dense<>; + using Dense = gko::matrix::Dense; Papi() : exec(gko::ReferenceExecutor::create()), eventset(PAPI_NULL) {} @@ -69,11 +73,11 @@ class Papi : public ::testing::Test { void TearDown() { eventset = PAPI_NULL; } - template + template const std::string init(const gko::log::Logger::mask_type &event, - const std::string &event_name, T *ptr) + const std::string &event_name, U *ptr) { - logger = gko::log::Papi<>::create(exec, event); + logger = gko::log::Papi::create(exec, event); std::ostringstream os; os << "sde:::" << logger->get_handle_name() << "::" << event_name << "::" << reinterpret_cast(ptr); @@ -110,352 +114,377 @@ class Papi : public ::testing::Test { } } - std::shared_ptr> logger; + std::shared_ptr> logger; std::shared_ptr exec; int eventset; }; +TYPED_TEST_CASE(Papi, gko::test::ValueTypes); + -TEST_F(Papi, CatchesAllocationStarted) +TYPED_TEST(Papi, CatchesAllocationStarted) { int logged_value = 42; - auto str = 
init(gko::log::Logger::allocation_started_mask, - "allocation_started", exec.get()); - add_event(str); + auto str = this->init(gko::log::Logger::allocation_started_mask, + "allocation_started", this->exec.get()); + this->add_event(str); - start(); - logger->on(exec.get(), logged_value); + this->start(); + this->logger->template on( + this->exec.get(), logged_value); long long int value = 0; - stop(&value); + this->stop(&value); ASSERT_EQ(value, logged_value); } -TEST_F(Papi, CatchesAllocationCompleted) +TYPED_TEST(Papi, CatchesAllocationCompleted) { int logged_value = 42; - auto str = init(gko::log::Logger::allocation_completed_mask, - "allocation_completed", exec.get()); - add_event(str); + auto str = this->init(gko::log::Logger::allocation_completed_mask, + "allocation_completed", this->exec.get()); + this->add_event(str); - start(); - logger->on(exec.get(), logged_value, - 0); + this->start(); + this->logger->template on( + this->exec.get(), logged_value, 0); long long int value = 0; - stop(&value); + this->stop(&value); ASSERT_EQ(value, logged_value); } -TEST_F(Papi, CatchesFreeStarted) +TYPED_TEST(Papi, CatchesFreeStarted) { - auto str = - init(gko::log::Logger::free_started_mask, "free_started", exec.get()); - add_event(str); + auto str = this->init(gko::log::Logger::free_started_mask, "free_started", + this->exec.get()); + this->add_event(str); - start(); - logger->on(exec.get(), 0); + this->start(); + this->logger->template on(this->exec.get(), + 0); long long int value = 0; - stop(&value); + this->stop(&value); ASSERT_EQ(value, 1); } -TEST_F(Papi, CatchesFreeCompleted) +TYPED_TEST(Papi, CatchesFreeCompleted) { - auto str = init(gko::log::Logger::free_completed_mask, "free_completed", - exec.get()); - add_event(str); + auto str = this->init(gko::log::Logger::free_completed_mask, + "free_completed", this->exec.get()); + this->add_event(str); - start(); - logger->on(exec.get(), 0); + this->start(); + this->logger->template on( + this->exec.get(), 0); long 
long int value = 0; - stop(&value); + this->stop(&value); ASSERT_EQ(value, 1); } -TEST_F(Papi, CatchesCopyStarted) +TYPED_TEST(Papi, CatchesCopyStarted) { auto logged_value = 42; - auto str = init(gko::log::Logger::copy_started_mask, "copy_started_from", - exec.get()); + auto str = this->init(gko::log::Logger::copy_started_mask, + "copy_started_from", this->exec.get()); std::ostringstream os_out; - os_out << "sde:::" << logger->get_handle_name() << "::copy_started_to::" - << reinterpret_cast(exec.get()); - add_event(str); - add_event(os_out.str()); - - start(); - logger->on(exec.get(), exec.get(), 0, 0, - logged_value); + os_out << "sde:::" << this->logger->get_handle_name() + << "::copy_started_to::" + << reinterpret_cast(this->exec.get()); + this->add_event(str); + this->add_event(os_out.str()); + + this->start(); + this->logger->template on( + this->exec.get(), this->exec.get(), 0, 0, logged_value); long long int values[2]; - stop(values); + this->stop(values); ASSERT_EQ(values[0], logged_value); ASSERT_EQ(values[1], logged_value); } -TEST_F(Papi, CatchesCopyCompleted) +TYPED_TEST(Papi, CatchesCopyCompleted) { auto logged_value = 42; - auto str = init(gko::log::Logger::copy_completed_mask, - "copy_completed_from", exec.get()); + auto str = this->init(gko::log::Logger::copy_completed_mask, + "copy_completed_from", this->exec.get()); std::ostringstream os_out; - os_out << "sde:::" << logger->get_handle_name() << "::copy_completed_to::" - << reinterpret_cast(exec.get()); - add_event(str); - add_event(os_out.str()); - - start(); - logger->on(exec.get(), exec.get(), 0, 0, - logged_value); + os_out << "sde:::" << this->logger->get_handle_name() + << "::copy_completed_to::" + << reinterpret_cast(this->exec.get()); + this->add_event(str); + this->add_event(os_out.str()); + + this->start(); + this->logger->template on( + this->exec.get(), this->exec.get(), 0, 0, logged_value); long long int values[2]; - stop(values); + this->stop(values); ASSERT_EQ(values[0], 
logged_value); ASSERT_EQ(values[1], logged_value); } -TEST_F(Papi, CatchesOperationLaunched) +TYPED_TEST(Papi, CatchesOperationLaunched) { - auto str = init(gko::log::Logger::operation_launched_mask, - "operation_launched", exec.get()); - add_event(str); + auto str = this->init(gko::log::Logger::operation_launched_mask, + "operation_launched", this->exec.get()); + this->add_event(str); - start(); - logger->on(exec.get(), nullptr); + this->start(); + this->logger->template on( + this->exec.get(), nullptr); long long int value = 0; - stop(&value); + this->stop(&value); ASSERT_EQ(value, 1); } -TEST_F(Papi, CatchesOperationCompleted) +TYPED_TEST(Papi, CatchesOperationCompleted) { - auto str = init(gko::log::Logger::operation_completed_mask, - "operation_completed", exec.get()); - add_event(str); + auto str = this->init(gko::log::Logger::operation_completed_mask, + "operation_completed", this->exec.get()); + this->add_event(str); - start(); - logger->on(exec.get(), nullptr); + this->start(); + this->logger->template on( + this->exec.get(), nullptr); long long int value = 0; - stop(&value); + this->stop(&value); ASSERT_EQ(value, 1); } -TEST_F(Papi, CatchesPolymorphicObjectCreateStarted) +TYPED_TEST(Papi, CatchesPolymorphicObjectCreateStarted) { - auto str = init(gko::log::Logger::polymorphic_object_create_started_mask, - "polymorphic_object_create_started", exec.get()); - add_event(str); - - start(); - logger->on(exec.get(), - nullptr); + auto str = + this->init(gko::log::Logger::polymorphic_object_create_started_mask, + "polymorphic_object_create_started", this->exec.get()); + this->add_event(str); + + this->start(); + this->logger + ->template on( + this->exec.get(), nullptr); long long int value = 0; - stop(&value); + this->stop(&value); ASSERT_EQ(value, 1); } -TEST_F(Papi, CatchesPolymorphicObjectCreateCompleted) +TYPED_TEST(Papi, CatchesPolymorphicObjectCreateCompleted) { - auto str = init(gko::log::Logger::polymorphic_object_create_completed_mask, - 
"polymorphic_object_create_completed", exec.get()); - add_event(str); - - start(); - logger->on( - exec.get(), nullptr, nullptr); + auto str = + this->init(gko::log::Logger::polymorphic_object_create_completed_mask, + "polymorphic_object_create_completed", this->exec.get()); + this->add_event(str); + + this->start(); + this->logger + ->template on( + this->exec.get(), nullptr, nullptr); long long int value = 0; - stop(&value); + this->stop(&value); ASSERT_EQ(value, 1); } -TEST_F(Papi, CatchesPolymorphicObjectCopyStarted) +TYPED_TEST(Papi, CatchesPolymorphicObjectCopyStarted) { - auto str = init(gko::log::Logger::polymorphic_object_copy_started_mask, - "polymorphic_object_copy_started", exec.get()); - add_event(str); - - start(); - logger->on( - exec.get(), nullptr, nullptr); + auto str = + this->init(gko::log::Logger::polymorphic_object_copy_started_mask, + "polymorphic_object_copy_started", this->exec.get()); + this->add_event(str); + + this->start(); + this->logger + ->template on( + this->exec.get(), nullptr, nullptr); long long int value = 0; - stop(&value); + this->stop(&value); ASSERT_EQ(value, 1); } -TEST_F(Papi, CatchesPolymorphicObjectCopyCompleted) +TYPED_TEST(Papi, CatchesPolymorphicObjectCopyCompleted) { - auto str = init(gko::log::Logger::polymorphic_object_copy_completed_mask, - "polymorphic_object_copy_completed", exec.get()); - add_event(str); - - start(); - logger->on( - exec.get(), nullptr, nullptr); + auto str = + this->init(gko::log::Logger::polymorphic_object_copy_completed_mask, + "polymorphic_object_copy_completed", this->exec.get()); + this->add_event(str); + + this->start(); + this->logger + ->template on( + this->exec.get(), nullptr, nullptr); long long int value = 0; - stop(&value); + this->stop(&value); ASSERT_EQ(value, 1); } -TEST_F(Papi, CatchesPolymorphicObjectDeleted) +TYPED_TEST(Papi, CatchesPolymorphicObjectDeleted) { - auto str = init(gko::log::Logger::polymorphic_object_deleted_mask, - "polymorphic_object_deleted", exec.get()); - 
add_event(str); + auto str = this->init(gko::log::Logger::polymorphic_object_deleted_mask, + "polymorphic_object_deleted", this->exec.get()); + this->add_event(str); - start(); - logger->on(exec.get(), - nullptr); + this->start(); + this->logger->template on( + this->exec.get(), nullptr); long long int value = 0; - stop(&value); + this->stop(&value); ASSERT_EQ(value, 1); } -TEST_F(Papi, CatchesLinOpApplyStarted) +TYPED_TEST(Papi, CatchesLinOpApplyStarted) { - auto A = Dense::create(exec); - auto str = init(gko::log::Logger::linop_apply_started_mask, - "linop_apply_started", A.get()); - add_event(str); - - start(); - logger->on(A.get(), nullptr, - nullptr); + using Dense = typename TestFixture::Dense; + auto A = Dense::create(this->exec); + auto str = this->init(gko::log::Logger::linop_apply_started_mask, + "linop_apply_started", A.get()); + this->add_event(str); + + this->start(); + this->logger->template on( + A.get(), nullptr, nullptr); long long int value = 0; - stop(&value); + this->stop(&value); ASSERT_EQ(value, 1); } -TEST_F(Papi, CatchesLinOpApplyCompleted) +TYPED_TEST(Papi, CatchesLinOpApplyCompleted) { - auto A = Dense::create(exec); - auto str = init(gko::log::Logger::linop_apply_completed_mask, - "linop_apply_completed", A.get()); - add_event(str); - - start(); - logger->on(A.get(), nullptr, - nullptr); + using Dense = typename TestFixture::Dense; + auto A = Dense::create(this->exec); + auto str = this->init(gko::log::Logger::linop_apply_completed_mask, + "linop_apply_completed", A.get()); + this->add_event(str); + + this->start(); + this->logger->template on( + A.get(), nullptr, nullptr); long long int value = 0; - stop(&value); + this->stop(&value); ASSERT_EQ(value, 1); } -TEST_F(Papi, CatchesLinOpAdvancedApplyStarted) +TYPED_TEST(Papi, CatchesLinOpAdvancedApplyStarted) { - auto A = Dense::create(exec); - auto str = init(gko::log::Logger::linop_advanced_apply_started_mask, - "linop_advanced_apply_started", A.get()); - add_event(str); - - start(); - 
logger->on( + using Dense = typename TestFixture::Dense; + auto A = Dense::create(this->exec); + auto str = this->init(gko::log::Logger::linop_advanced_apply_started_mask, + "linop_advanced_apply_started", A.get()); + this->add_event(str); + + this->start(); + this->logger->template on( A.get(), nullptr, nullptr, nullptr, nullptr); long long int value = 0; - stop(&value); + this->stop(&value); ASSERT_EQ(value, 1); } -TEST_F(Papi, CatchesLinOpAdvancedApplyCompleted) +TYPED_TEST(Papi, CatchesLinOpAdvancedApplyCompleted) { - auto A = Dense::create(exec); - auto str = init(gko::log::Logger::linop_advanced_apply_completed_mask, - "linop_advanced_apply_completed", A.get()); - add_event(str); - - start(); - logger->on( + using Dense = typename TestFixture::Dense; + auto A = Dense::create(this->exec); + auto str = this->init(gko::log::Logger::linop_advanced_apply_completed_mask, + "linop_advanced_apply_completed", A.get()); + this->add_event(str); + + this->start(); + this->logger->template on( A.get(), nullptr, nullptr, nullptr, nullptr); long long int value = 0; - stop(&value); + this->stop(&value); ASSERT_EQ(value, 1); } -TEST_F(Papi, CatchesLinOpFactoryGenerateStarted) +TYPED_TEST(Papi, CatchesLinOpFactoryGenerateStarted) { auto factory = - gko::solver::Bicgstab<>::build() + gko::solver::Bicgstab::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec); - auto str = init(gko::log::Logger::linop_factory_generate_started_mask, - "linop_factory_generate_started", factory.get()); - add_event(str); - - start(); - logger->on(factory.get(), - nullptr); + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec); + auto str = this->init(gko::log::Logger::linop_factory_generate_started_mask, + "linop_factory_generate_started", factory.get()); + this->add_event(str); + + this->start(); + this->logger->template on( + factory.get(), nullptr); long long int value = 0; - stop(&value); + this->stop(&value); 
ASSERT_EQ(value, 1); } -TEST_F(Papi, CatchesLinOpFactoryGenerateCompleted) +TYPED_TEST(Papi, CatchesLinOpFactoryGenerateCompleted) { auto factory = - gko::solver::Bicgstab<>::build() + gko::solver::Bicgstab::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec); - auto str = init(gko::log::Logger::linop_factory_generate_completed_mask, - "linop_factory_generate_completed", factory.get()); - add_event(str); - - start(); - logger->on( - factory.get(), nullptr, nullptr); + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec); + TypeParam dummy; + auto str = + this->init(gko::log::Logger::linop_factory_generate_completed_mask, + "linop_factory_generate_completed", factory.get()); + this->add_event(str); + + this->start(); + this->logger + ->template on( + factory.get(), nullptr, nullptr); long long int value = 0; - stop(&value); + this->stop(&value); ASSERT_EQ(value, 1); } -TEST_F(Papi, CatchesIterationComplete) +TYPED_TEST(Papi, CatchesIterationComplete) { + using Dense = typename TestFixture::Dense; int logged_value = 42; - auto A = Dense::create(exec); - auto str = init(gko::log::Logger::iteration_complete_mask, - "iteration_complete", A.get()); - add_event(str); - - start(); - logger->on(A.get(), 42, nullptr, - nullptr, nullptr); + auto A = Dense::create(this->exec); + auto str = this->init(gko::log::Logger::iteration_complete_mask, + "iteration_complete", A.get()); + this->add_event(str); + + this->start(); + this->logger->template on( + A.get(), 42, nullptr, nullptr, nullptr); long long int value = 0; - stop(&value); + this->stop(&value); ASSERT_EQ(value, logged_value); } diff --git a/core/test/log/record.cpp b/core/test/log/record.cpp index 369e12c6af9..dd829d39a8c 100644 --- a/core/test/log/record.cpp +++ b/core/test/log/record.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo 
authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -36,13 +36,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include #include #include #include +#include "core/test/utils/assertions.hpp" + + namespace { diff --git a/core/test/log/stream.cpp b/core/test/log/stream.cpp index 4f22c49927c..163a54fd74a 100644 --- a/core/test/log/stream.cpp +++ b/core/test/log/stream.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,32 +33,43 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include #include +#include + + +#include -#include #include #include #include #include +#include "core/test/utils.hpp" + + namespace { constexpr int num_iters = 10; -TEST(Stream, CatchesAllocationStarted) +template +class Stream : public ::testing::Test {}; + +TYPED_TEST_CASE(Stream, gko::test::ValueTypes); + + +TYPED_TEST(Stream, CatchesAllocationStarted) { auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::allocation_started_mask, out); - logger->on(exec.get(), 42); + logger->template on(exec.get(), 42); auto os = out.str(); GKO_ASSERT_STR_CONTAINS(os, "allocation started on"); @@ -66,17 +77,17 @@ TEST(Stream, CatchesAllocationStarted) } -TEST(Stream, CatchesAllocationCompleted) +TYPED_TEST(Stream, CatchesAllocationCompleted) { auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::allocation_completed_mask, out); int dummy = 1; std::stringstream ptrstream; ptrstream << std::hex << "0x" << 
reinterpret_cast(&dummy); - logger->on( + logger->template on( exec.get(), 42, reinterpret_cast(&dummy)); auto os = out.str(); @@ -86,17 +97,17 @@ TEST(Stream, CatchesAllocationCompleted) } -TEST(Stream, CatchesFreeStarted) +TYPED_TEST(Stream, CatchesFreeStarted) { auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::free_started_mask, out); int dummy = 1; std::stringstream ptrstream; ptrstream << std::hex << "0x" << reinterpret_cast(&dummy); - logger->on( + logger->template on( exec.get(), reinterpret_cast(&dummy)); auto os = out.str(); @@ -105,17 +116,17 @@ TEST(Stream, CatchesFreeStarted) } -TEST(Stream, CatchesFreeCompleted) +TYPED_TEST(Stream, CatchesFreeCompleted) { auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::free_completed_mask, out); int dummy = 1; std::stringstream ptrstream; ptrstream << std::hex << "0x" << reinterpret_cast(&dummy); - logger->on( + logger->template on( exec.get(), reinterpret_cast(&dummy)); auto os = out.str(); @@ -124,11 +135,11 @@ TEST(Stream, CatchesFreeCompleted) } -TEST(Stream, CatchesCopyStarted) +TYPED_TEST(Stream, CatchesCopyStarted) { auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::copy_started_mask, out); int dummy_in = 1; int dummy_out = 1; @@ -139,7 +150,7 @@ TEST(Stream, CatchesCopyStarted) ptrstream_out << std::hex << "0x" << reinterpret_cast(&dummy_out); - logger->on( + logger->template on( exec.get(), exec.get(), reinterpret_cast(&dummy_in), reinterpret_cast(&dummy_out), 42); @@ -151,11 +162,11 @@ TEST(Stream, CatchesCopyStarted) } -TEST(Stream, CatchesCopyCompleted) +TYPED_TEST(Stream, CatchesCopyCompleted) { auto exec = 
gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::copy_completed_mask, out); int dummy_in = 1; int dummy_out = 1; @@ -166,7 +177,7 @@ TEST(Stream, CatchesCopyCompleted) ptrstream_out << std::hex << "0x" << reinterpret_cast(&dummy_out); - logger->on( + logger->template on( exec.get(), exec.get(), reinterpret_cast(&dummy_in), reinterpret_cast(&dummy_out), 42); @@ -178,17 +189,17 @@ TEST(Stream, CatchesCopyCompleted) } -TEST(Stream, CatchesOperationLaunched) +TYPED_TEST(Stream, CatchesOperationLaunched) { auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::operation_launched_mask, out); gko::Operation op; std::stringstream ptrstream; ptrstream << &op; - logger->on(exec.get(), &op); + logger->template on(exec.get(), &op); auto os = out.str(); GKO_ASSERT_STR_CONTAINS(os, "started on"); @@ -196,17 +207,17 @@ TEST(Stream, CatchesOperationLaunched) } -TEST(Stream, CatchesOperationCompleted) +TYPED_TEST(Stream, CatchesOperationCompleted) { auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::operation_completed_mask, out); gko::Operation op; std::stringstream ptrstream; ptrstream << &op; - logger->on(exec.get(), &op); + logger->template on(exec.get(), &op); auto os = out.str(); GKO_ASSERT_STR_CONTAINS(os, "completed on"); @@ -214,18 +225,18 @@ TEST(Stream, CatchesOperationCompleted) } -TEST(Stream, CatchesPolymorphicObjectCreateStarted) +TYPED_TEST(Stream, CatchesPolymorphicObjectCreateStarted) { auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::polymorphic_object_create_started_mask, 
out); - auto po = gko::matrix::Dense<>::create(exec); + auto po = gko::matrix::Dense::create(exec); std::stringstream ptrstream; ptrstream << po.get(); - logger->on(exec.get(), - po.get()); + logger->template on( + exec.get(), po.get()); auto os = out.str(); GKO_ASSERT_STR_CONTAINS(os, ptrstream.str()); @@ -233,20 +244,20 @@ TEST(Stream, CatchesPolymorphicObjectCreateStarted) } -TEST(Stream, CatchesPolymorphicObjectCreateCompleted) +TYPED_TEST(Stream, CatchesPolymorphicObjectCreateCompleted) { auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::polymorphic_object_create_completed_mask, out); - auto po = gko::matrix::Dense<>::create(exec); - auto output = gko::matrix::Dense<>::create(exec); + auto po = gko::matrix::Dense::create(exec); + auto output = gko::matrix::Dense::create(exec); std::stringstream ptrstream_in; ptrstream_in << po.get(); std::stringstream ptrstream_out; ptrstream_out << output.get(); - logger->on( + logger->template on( exec.get(), po.get(), output.get()); auto os = out.str(); @@ -256,20 +267,20 @@ TEST(Stream, CatchesPolymorphicObjectCreateCompleted) } -TEST(Stream, CatchesPolymorphicObjectCopyStarted) +TYPED_TEST(Stream, CatchesPolymorphicObjectCopyStarted) { auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::polymorphic_object_copy_started_mask, out); - auto from = gko::matrix::Dense<>::create(exec); - auto to = gko::matrix::Dense<>::create(exec); + auto from = gko::matrix::Dense::create(exec); + auto to = gko::matrix::Dense::create(exec); std::stringstream ptrstream_from; ptrstream_from << from.get(); std::stringstream ptrstream_to; ptrstream_to << to.get(); - logger->on( + logger->template on( exec.get(), from.get(), to.get()); auto os = out.str(); @@ -279,20 +290,20 @@ TEST(Stream, 
CatchesPolymorphicObjectCopyStarted) } -TEST(Stream, CatchesPolymorphicObjectCopyCompleted) +TYPED_TEST(Stream, CatchesPolymorphicObjectCopyCompleted) { auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::polymorphic_object_copy_completed_mask, out); - auto from = gko::matrix::Dense<>::create(exec); - auto to = gko::matrix::Dense<>::create(exec); + auto from = gko::matrix::Dense::create(exec); + auto to = gko::matrix::Dense::create(exec); std::stringstream ptrstream_from; ptrstream_from << from.get(); std::stringstream ptrstream_to; ptrstream_to << to.get(); - logger->on( + logger->template on( exec.get(), from.get(), to.get()); auto os = out.str(); @@ -302,18 +313,18 @@ TEST(Stream, CatchesPolymorphicObjectCopyCompleted) } -TEST(Stream, CatchesPolymorphicObjectDeleted) +TYPED_TEST(Stream, CatchesPolymorphicObjectDeleted) { auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::polymorphic_object_deleted_mask, out); - auto po = gko::matrix::Dense<>::create(exec); + auto po = gko::matrix::Dense::create(exec); std::stringstream ptrstream; ptrstream << po.get(); - logger->on(exec.get(), - po.get()); + logger->template on( + exec.get(), po.get()); auto os = out.str(); GKO_ASSERT_STR_CONTAINS(os, ptrstream.str()); @@ -321,12 +332,12 @@ TEST(Stream, CatchesPolymorphicObjectDeleted) } -TEST(Stream, CatchesLinOpApplyStarted) +TYPED_TEST(Stream, CatchesLinOpApplyStarted) { - using Dense = gko::matrix::Dense<>; + using Dense = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::linop_apply_started_mask, out); auto A = Dense::create(exec); auto b = Dense::create(exec); @@ -338,8 +349,8 
@@ TEST(Stream, CatchesLinOpApplyStarted) std::stringstream ptrstream_x; ptrstream_x << x.get(); - logger->on(A.get(), b.get(), - x.get()); + logger->template on(A.get(), b.get(), + x.get()); auto os = out.str(); GKO_ASSERT_STR_CONTAINS(os, "apply started on A"); @@ -349,19 +360,19 @@ TEST(Stream, CatchesLinOpApplyStarted) } -TEST(Stream, CatchesLinOpApplyStartedWithVerbose) +TYPED_TEST(Stream, CatchesLinOpApplyStartedWithVerbose) { - using Dense = gko::matrix::Dense<>; + using Dense = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::linop_apply_started_mask, out, true); auto A = gko::initialize({1.1}, exec); auto b = gko::initialize({-2.2}, exec); auto x = gko::initialize({3.3}, exec); - logger->on(A.get(), b.get(), - x.get()); + logger->template on(A.get(), b.get(), + x.get()); auto os = out.str(); GKO_ASSERT_STR_CONTAINS(os, "1.1"); @@ -370,12 +381,12 @@ TEST(Stream, CatchesLinOpApplyStartedWithVerbose) } -TEST(Stream, CatchesLinOpApplyCompleted) +TYPED_TEST(Stream, CatchesLinOpApplyCompleted) { - using Dense = gko::matrix::Dense<>; + using Dense = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::linop_apply_completed_mask, out); auto A = Dense::create(exec); auto b = Dense::create(exec); @@ -387,8 +398,8 @@ TEST(Stream, CatchesLinOpApplyCompleted) std::stringstream ptrstream_x; ptrstream_x << x.get(); - logger->on(A.get(), b.get(), - x.get()); + logger->template on( + A.get(), b.get(), x.get()); auto os = out.str(); GKO_ASSERT_STR_CONTAINS(os, "apply completed on A"); @@ -398,19 +409,19 @@ TEST(Stream, CatchesLinOpApplyCompleted) } -TEST(Stream, CatchesLinOpApplyCompletedWithVerbose) +TYPED_TEST(Stream, CatchesLinOpApplyCompletedWithVerbose) { - using Dense = 
gko::matrix::Dense<>; + using Dense = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::linop_apply_completed_mask, out, true); auto A = gko::initialize({1.1}, exec); auto b = gko::initialize({-2.2}, exec); auto x = gko::initialize({3.3}, exec); - logger->on(A.get(), b.get(), - x.get()); + logger->template on( + A.get(), b.get(), x.get()); auto os = out.str(); GKO_ASSERT_STR_CONTAINS(os, "1.1"); @@ -419,12 +430,12 @@ TEST(Stream, CatchesLinOpApplyCompletedWithVerbose) } -TEST(Stream, CatchesLinOpAdvancedApplyStarted) +TYPED_TEST(Stream, CatchesLinOpAdvancedApplyStarted) { - using Dense = gko::matrix::Dense<>; + using Dense = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::linop_advanced_apply_started_mask, out); auto A = Dense::create(exec); auto alpha = Dense::create(exec); @@ -442,7 +453,7 @@ TEST(Stream, CatchesLinOpAdvancedApplyStarted) std::stringstream ptrstream_x; ptrstream_x << x.get(); - logger->on( + logger->template on( A.get(), alpha.get(), b.get(), beta.get(), x.get()); auto os = out.str(); @@ -455,12 +466,12 @@ TEST(Stream, CatchesLinOpAdvancedApplyStarted) } -TEST(Stream, CatchesLinOpAdvancedApplyStartedWithVerbose) +TYPED_TEST(Stream, CatchesLinOpAdvancedApplyStartedWithVerbose) { - using Dense = gko::matrix::Dense<>; + using Dense = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::linop_advanced_apply_started_mask, out, true); auto A = gko::initialize({1.1}, exec); auto alpha = gko::initialize({-4.4}, exec); @@ -468,7 +479,7 @@ TEST(Stream, CatchesLinOpAdvancedApplyStartedWithVerbose) auto beta = 
gko::initialize({-5.5}, exec); auto x = gko::initialize({3.3}, exec); - logger->on( + logger->template on( A.get(), alpha.get(), b.get(), beta.get(), x.get()); auto os = out.str(); @@ -480,12 +491,12 @@ TEST(Stream, CatchesLinOpAdvancedApplyStartedWithVerbose) } -TEST(Stream, CatchesLinOpAdvancedApplyCompleted) +TYPED_TEST(Stream, CatchesLinOpAdvancedApplyCompleted) { - using Dense = gko::matrix::Dense<>; + using Dense = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::linop_advanced_apply_completed_mask, out); auto A = Dense::create(exec); auto alpha = Dense::create(exec); @@ -503,7 +514,7 @@ TEST(Stream, CatchesLinOpAdvancedApplyCompleted) std::stringstream ptrstream_x; ptrstream_x << x.get(); - logger->on( + logger->template on( A.get(), alpha.get(), b.get(), beta.get(), x.get()); auto os = out.str(); @@ -516,12 +527,12 @@ TEST(Stream, CatchesLinOpAdvancedApplyCompleted) } -TEST(Stream, CatchesLinOpAdvancedApplyCompletedWithVerbose) +TYPED_TEST(Stream, CatchesLinOpAdvancedApplyCompletedWithVerbose) { - using Dense = gko::matrix::Dense<>; + using Dense = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::linop_advanced_apply_completed_mask, out, true); auto A = gko::initialize({1.1}, exec); auto alpha = gko::initialize({-4.4}, exec); @@ -529,7 +540,7 @@ TEST(Stream, CatchesLinOpAdvancedApplyCompletedWithVerbose) auto beta = gko::initialize({-5.5}, exec); auto x = gko::initialize({3.3}, exec); - logger->on( + logger->template on( A.get(), alpha.get(), b.get(), beta.get(), x.get()); auto os = out.str(); @@ -541,25 +552,25 @@ TEST(Stream, CatchesLinOpAdvancedApplyCompletedWithVerbose) } -TEST(Stream, CatchesLinopFactoryGenerateStarted) +TYPED_TEST(Stream, 
CatchesLinopFactoryGenerateStarted) { auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::linop_factory_generate_started_mask, out); auto factory = - gko::solver::Bicgstab<>::build() + gko::solver::Bicgstab::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(3u).on(exec)) .on(exec); - auto input = factory->generate(gko::matrix::Dense<>::create(exec)); + auto input = factory->generate(gko::matrix::Dense::create(exec)); std::stringstream ptrstream_factory; ptrstream_factory << factory.get(); std::stringstream ptrstream_input; ptrstream_input << input.get(); - logger->on(factory.get(), - input.get()); + logger->template on( + factory.get(), input.get()); auto os = out.str(); GKO_ASSERT_STR_CONTAINS(os, "generate started for"); @@ -568,19 +579,20 @@ TEST(Stream, CatchesLinopFactoryGenerateStarted) } -TEST(Stream, CatchesLinopFactoryGenerateCompleted) +TYPED_TEST(Stream, CatchesLinopFactoryGenerateCompleted) { auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::linop_factory_generate_completed_mask, out); auto factory = - gko::solver::Bicgstab<>::build() + gko::solver::Bicgstab::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(3u).on(exec)) .on(exec); - auto input = factory->generate(gko::matrix::Dense<>::create(exec)); - auto output = factory->generate(gko::matrix::Dense<>::create(exec)); + auto input = factory->generate(gko::matrix::Dense::create(exec)); + auto output = + factory->generate(gko::matrix::Dense::create(exec)); std::stringstream ptrstream_factory; ptrstream_factory << factory.get(); std::stringstream ptrstream_input; @@ -588,7 +600,7 @@ TEST(Stream, CatchesLinopFactoryGenerateCompleted) std::stringstream ptrstream_output; ptrstream_output << output.get(); - logger->on( 
+ logger->template on( factory.get(), input.get(), output.get()); auto os = out.str(); @@ -599,11 +611,11 @@ TEST(Stream, CatchesLinopFactoryGenerateCompleted) } -TEST(Stream, CatchesCriterionCheckStarted) +TYPED_TEST(Stream, CatchesCriterionCheckStarted) { auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::criterion_check_started_mask, out); auto criterion = gko::stop::Iteration::build().with_max_iters(3u).on(exec)->generate( @@ -614,7 +626,7 @@ TEST(Stream, CatchesCriterionCheckStarted) std::stringstream true_in_stream; true_in_stream << true; - logger->on( + logger->template on( criterion.get(), 1, nullptr, nullptr, nullptr, RelativeStoppingId, true); @@ -626,11 +638,11 @@ TEST(Stream, CatchesCriterionCheckStarted) } -TEST(Stream, CatchesCriterionCheckCompleted) +TYPED_TEST(Stream, CatchesCriterionCheckCompleted) { auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::criterion_check_completed_mask, out); auto criterion = gko::stop::Iteration::build().with_max_iters(3u).on(exec)->generate( @@ -642,7 +654,7 @@ TEST(Stream, CatchesCriterionCheckCompleted) std::stringstream true_in_stream; true_in_stream << true; - logger->on( + logger->template on( criterion.get(), 1, nullptr, nullptr, nullptr, RelativeStoppingId, true, &stop_status, true, true); @@ -657,11 +669,11 @@ TEST(Stream, CatchesCriterionCheckCompleted) } -TEST(Stream, CatchesCriterionCheckCompletedWithVerbose) +TYPED_TEST(Stream, CatchesCriterionCheckCompletedWithVerbose) { auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::criterion_check_completed_mask, out, true); auto criterion = 
gko::stop::Iteration::build().with_max_iters(3u).on(exec)->generate( @@ -673,7 +685,7 @@ TEST(Stream, CatchesCriterionCheckCompletedWithVerbose) stop_status.get_data()->reset(); stop_status.get_data()->stop(RelativeStoppingId); - logger->on( + logger->template on( criterion.get(), 1, nullptr, nullptr, nullptr, RelativeStoppingId, true, &stop_status, true, true); @@ -685,12 +697,12 @@ TEST(Stream, CatchesCriterionCheckCompletedWithVerbose) } -TEST(Stream, CatchesIterations) +TYPED_TEST(Stream, CatchesIterations) { - using Dense = gko::matrix::Dense<>; + using Dense = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::iteration_complete_mask, out); auto solver = Dense::create(exec); auto residual = Dense::create(exec); @@ -701,25 +713,26 @@ TEST(Stream, CatchesIterations) std::stringstream ptrstream_residual; ptrstream_residual << residual.get(); - logger->on(solver.get(), num_iters, - residual.get()); + logger->template on( + solver.get(), num_iters, residual.get()); - GKO_ASSERT_STR_CONTAINS(out.str(), "iteration " + num_iters); + GKO_ASSERT_STR_CONTAINS(out.str(), + "iteration " + std::to_string(num_iters)); GKO_ASSERT_STR_CONTAINS(out.str(), ptrstream_solver.str()); GKO_ASSERT_STR_CONTAINS(out.str(), ptrstream_residual.str()); } -TEST(Stream, CatchesIterationsWithVerbose) +TYPED_TEST(Stream, CatchesIterationsWithVerbose) { - using Dense = gko::matrix::Dense<>; + using Dense = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); std::stringstream out; - auto logger = gko::log::Stream<>::create( + auto logger = gko::log::Stream::create( exec, gko::log::Logger::iteration_complete_mask, out, true); auto factory = - gko::solver::Bicgstab<>::build() + gko::solver::Bicgstab::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(3u).on(exec)) .on(exec); @@ -728,7 +741,7 @@ TEST(Stream, 
CatchesIterationsWithVerbose) auto solution = gko::initialize({-2.2}, exec); auto residual_norm = gko::initialize({-3.3}, exec); - logger->on( + logger->template on( solver.get(), num_iters, residual.get(), solution.get(), residual_norm.get()); diff --git a/core/test/matrix/CMakeLists.txt b/core/test/matrix/CMakeLists.txt index ba370edb60f..68382fa4b8f 100644 --- a/core/test/matrix/CMakeLists.txt +++ b/core/test/matrix/CMakeLists.txt @@ -4,5 +4,8 @@ ginkgo_create_test(dense) ginkgo_create_test(ell) ginkgo_create_test(hybrid) ginkgo_create_test(identity) +ginkgo_create_test(permutation) ginkgo_create_test(sellp) ginkgo_create_test(sparsity_csr) +ginkgo_create_test(csr_builder) +ginkgo_create_test(coo_builder) \ No newline at end of file diff --git a/core/test/matrix/coo.cpp b/core/test/matrix/coo.cpp index 3f78fa76ad5..92a999febb6 100644 --- a/core/test/matrix/coo.cpp +++ b/core/test/matrix/coo.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -36,20 +36,29 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/test/utils.hpp" + + namespace { +template class Coo : public ::testing::Test { protected: - using Mtx = gko::matrix::Coo<>; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Mtx = gko::matrix::Coo; Coo() : exec(gko::ReferenceExecutor::create()), - mtx(gko::matrix::Coo<>::create(exec, gko::dim<2>{2, 3}, 4)) + mtx(gko::matrix::Coo::create( + exec, gko::dim<2>{2, 3}, 4)) { - Mtx::value_type *v = mtx->get_values(); - Mtx::index_type *c = mtx->get_col_idxs(); - Mtx::index_type *r = mtx->get_row_idxs(); + value_type *v = mtx->get_values(); + index_type *c = mtx->get_col_idxs(); + index_type *r = mtx->get_row_idxs(); r[0] = 0; r[1] = 0; r[2] = 0; @@ -82,10 +91,10 @@ class Coo : public ::testing::Test { EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 3.0); - EXPECT_EQ(v[2], 2.0); - EXPECT_EQ(v[3], 5.0); + EXPECT_EQ(v[0], value_type{1.0}); + EXPECT_EQ(v[1], value_type{3.0}); + EXPECT_EQ(v[2], value_type{2.0}); + EXPECT_EQ(v[3], value_type{5.0}); } void assert_empty(const Mtx *m) @@ -98,35 +107,44 @@ class Coo : public ::testing::Test { } }; +TYPED_TEST_CASE(Coo, gko::test::ValueIndexTypes); -TEST_F(Coo, KnowsItsSize) + +TYPED_TEST(Coo, KnowsItsSize) { - ASSERT_EQ(mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(mtx->get_num_stored_elements(), 4); + ASSERT_EQ(this->mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(this->mtx->get_num_stored_elements(), 4); } -TEST_F(Coo, ContainsCorrectData) { assert_equal_to_original_mtx(mtx.get()); } +TYPED_TEST(Coo, ContainsCorrectData) +{ + this->assert_equal_to_original_mtx(this->mtx.get()); +} -TEST_F(Coo, CanBeEmpty) +TYPED_TEST(Coo, CanBeEmpty) { - auto mtx = Mtx::create(exec); + using Mtx = typename TestFixture::Mtx; + auto mtx = Mtx::create(this->exec); - assert_empty(mtx.get()); + this->assert_empty(mtx.get()); } 
-TEST_F(Coo, CanBeCreatedFromExistingData) +TYPED_TEST(Coo, CanBeCreatedFromExistingData) { - double values[] = {1.0, 2.0, 3.0, 4.0}; - gko::int32 col_idxs[] = {0, 1, 1, 0}; - gko::int32 row_idxs[] = {0, 0, 1, 2}; - - auto mtx = gko::matrix::Coo<>::create( - exec, gko::dim<2>{3, 2}, gko::Array::view(exec, 4, values), - gko::Array::view(exec, 4, col_idxs), - gko::Array::view(exec, 4, row_idxs)); + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + value_type values[] = {1.0, 2.0, 3.0, 4.0}; + index_type col_idxs[] = {0, 1, 1, 0}; + index_type row_idxs[] = {0, 0, 1, 2}; + + auto mtx = gko::matrix::Coo::create( + this->exec, gko::dim<2>{3, 2}, + gko::Array::view(this->exec, 4, values), + gko::Array::view(this->exec, 4, col_idxs), + gko::Array::view(this->exec, 4, row_idxs)); ASSERT_EQ(mtx->get_const_values(), values); ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs); @@ -134,48 +152,53 @@ TEST_F(Coo, CanBeCreatedFromExistingData) } -TEST_F(Coo, CanBeCopied) +TYPED_TEST(Coo, CanBeCopied) { - auto copy = Mtx::create(exec); + using Mtx = typename TestFixture::Mtx; + auto copy = Mtx::create(this->exec); - copy->copy_from(mtx.get()); + copy->copy_from(this->mtx.get()); - assert_equal_to_original_mtx(mtx.get()); - mtx->get_values()[1] = 5.0; - assert_equal_to_original_mtx(copy.get()); + this->assert_equal_to_original_mtx(this->mtx.get()); + this->mtx->get_values()[1] = 5.0; + this->assert_equal_to_original_mtx(copy.get()); } -TEST_F(Coo, CanBeMoved) +TYPED_TEST(Coo, CanBeMoved) { - auto copy = Mtx::create(exec); + using Mtx = typename TestFixture::Mtx; + auto copy = Mtx::create(this->exec); - copy->copy_from(std::move(mtx)); + copy->copy_from(std::move(this->mtx)); - assert_equal_to_original_mtx(copy.get()); + this->assert_equal_to_original_mtx(copy.get()); } -TEST_F(Coo, CanBeCloned) +TYPED_TEST(Coo, CanBeCloned) { - auto clone = mtx->clone(); + using Mtx = typename TestFixture::Mtx; + auto clone = 
this->mtx->clone(); - assert_equal_to_original_mtx(mtx.get()); - mtx->get_values()[1] = 5.0; - assert_equal_to_original_mtx(dynamic_cast(clone.get())); + this->assert_equal_to_original_mtx(this->mtx.get()); + this->mtx->get_values()[1] = 5.0; + this->assert_equal_to_original_mtx(dynamic_cast(clone.get())); } -TEST_F(Coo, CanBeCleared) + +TYPED_TEST(Coo, CanBeCleared) { - mtx->clear(); + this->mtx->clear(); - assert_empty(mtx.get()); + this->assert_empty(this->mtx.get()); } -TEST_F(Coo, CanBeReadFromMatrixData) +TYPED_TEST(Coo, CanBeReadFromMatrixData) { - auto m = Mtx::create(exec); + using Mtx = typename TestFixture::Mtx; + auto m = Mtx::create(this->exec); m->read({{2, 3}, {{0, 0, 1.0}, {0, 1, 3.0}, @@ -184,23 +207,26 @@ TEST_F(Coo, CanBeReadFromMatrixData) {1, 1, 5.0}, {1, 2, 0.0}}}); - assert_equal_to_original_mtx(m.get()); + this->assert_equal_to_original_mtx(m.get()); } -TEST_F(Coo, GeneratesCorrectMatrixData) +TYPED_TEST(Coo, GeneratesCorrectMatrixData) { - using tpl = gko::matrix_data<>::nonzero_type; - gko::matrix_data<> data; + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using tpl = typename gko::matrix_data::nonzero_type; + gko::matrix_data data; - mtx->write(data); + this->mtx->write(data); ASSERT_EQ(data.size, gko::dim<2>(2, 3)); ASSERT_EQ(data.nonzeros.size(), 4); - EXPECT_EQ(data.nonzeros[0], tpl(0, 0, 1.0)); - EXPECT_EQ(data.nonzeros[1], tpl(0, 1, 3.0)); - EXPECT_EQ(data.nonzeros[2], tpl(0, 2, 2.0)); - EXPECT_EQ(data.nonzeros[3], tpl(1, 1, 5.0)); + EXPECT_EQ(data.nonzeros[0], tpl(0, 0, value_type{1.0})); + EXPECT_EQ(data.nonzeros[1], tpl(0, 1, value_type{3.0})); + EXPECT_EQ(data.nonzeros[2], tpl(0, 2, value_type{2.0})); + EXPECT_EQ(data.nonzeros[3], tpl(1, 1, value_type{5.0})); } diff --git a/core/test/matrix/coo_builder.cpp b/core/test/matrix/coo_builder.cpp new file mode 100644 index 00000000000..de5844b0bbe --- /dev/null +++ 
b/core/test/matrix/coo_builder.cpp @@ -0,0 +1,88 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/matrix/coo_builder.hpp" + + +#include + + +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +template +class CooBuilder : public ::testing::Test { +protected: + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Mtx = gko::matrix::Coo; + + CooBuilder() + : exec(gko::ReferenceExecutor::create()), + mtx(Mtx::create(exec, gko::dim<2>{2, 3}, 4)) + {} + + std::shared_ptr exec; + std::unique_ptr mtx; +}; + +TYPED_TEST_CASE(CooBuilder, gko::test::ValueIndexTypes); + + +TYPED_TEST(CooBuilder, ReturnsCorrectArrays) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + gko::matrix::CooBuilder builder{this->mtx.get()}; + + auto builder_row_idxs = builder.get_row_idx_array().get_data(); + auto builder_col_idxs = builder.get_col_idx_array().get_data(); + auto builder_values = builder.get_value_array().get_data(); + auto ref_row_idxs = this->mtx->get_row_idxs(); + auto ref_col_idxs = this->mtx->get_col_idxs(); + auto ref_values = this->mtx->get_values(); + + ASSERT_EQ(builder_row_idxs, ref_row_idxs); + ASSERT_EQ(builder_col_idxs, ref_col_idxs); + ASSERT_EQ(builder_values, ref_values); +} + + +} // namespace diff --git a/core/test/matrix/csr.cpp b/core/test/matrix/csr.cpp index 155f2c8ce21..f927861afce 100644 --- a/core/test/matrix/csr.cpp +++ b/core/test/matrix/csr.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -36,23 +36,31 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/test/utils.hpp" + + namespace { +template class Csr : public ::testing::Test { protected: - using Mtx = gko::matrix::Csr<>; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Mtx = gko::matrix::Csr; Csr() : exec(gko::ReferenceExecutor::create()), - mtx(gko::matrix::Csr<>::create( + mtx(gko::matrix::Csr::create( exec, gko::dim<2>{2, 3}, 4, - std::make_shared(2))) + std::make_shared(2))) { - Mtx::value_type *v = mtx->get_values(); - Mtx::index_type *c = mtx->get_col_idxs(); - Mtx::index_type *r = mtx->get_row_ptrs(); - Mtx::index_type *s = mtx->get_srow(); + value_type *v = mtx->get_values(); + index_type *c = mtx->get_col_idxs(); + index_type *r = mtx->get_row_ptrs(); + index_type *s = mtx->get_srow(); r[0] = 0; r[1] = 3; r[2] = 4; @@ -85,10 +93,10 @@ class Csr : public ::testing::Test { EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 3.0); - EXPECT_EQ(v[2], 2.0); - EXPECT_EQ(v[3], 5.0); + EXPECT_EQ(v[0], value_type{1.0}); + EXPECT_EQ(v[1], value_type{3.0}); + EXPECT_EQ(v[2], value_type{2.0}); + EXPECT_EQ(v[3], value_type{5.0}); EXPECT_EQ(s[0], 0); } @@ -98,41 +106,51 @@ class Csr : public ::testing::Test { ASSERT_EQ(m->get_num_stored_elements(), 0); ASSERT_EQ(m->get_const_values(), nullptr); ASSERT_EQ(m->get_const_col_idxs(), nullptr); - ASSERT_EQ(m->get_const_row_ptrs(), nullptr); + ASSERT_NE(m->get_const_row_ptrs(), nullptr); ASSERT_EQ(m->get_const_srow(), nullptr); } }; +TYPED_TEST_CASE(Csr, gko::test::ValueIndexTypes); + -TEST_F(Csr, KnowsItsSize) +TYPED_TEST(Csr, KnowsItsSize) { - ASSERT_EQ(mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(mtx->get_num_stored_elements(), 4); + ASSERT_EQ(this->mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(this->mtx->get_num_stored_elements(), 4); } -TEST_F(Csr, ContainsCorrectData) { 
assert_equal_to_original_mtx(mtx.get()); } +TYPED_TEST(Csr, ContainsCorrectData) +{ + this->assert_equal_to_original_mtx(this->mtx.get()); +} -TEST_F(Csr, CanBeEmpty) +TYPED_TEST(Csr, CanBeEmpty) { - auto mtx = Mtx::create(exec); + using Mtx = typename TestFixture::Mtx; + auto mtx = Mtx::create(this->exec); - assert_empty(mtx.get()); + this->assert_empty(mtx.get()); } -TEST_F(Csr, CanBeCreatedFromExistingData) +TYPED_TEST(Csr, CanBeCreatedFromExistingData) { - double values[] = {1.0, 2.0, 3.0, 4.0}; - gko::int32 col_idxs[] = {0, 1, 1, 0}; - gko::int32 row_ptrs[] = {0, 2, 3, 4}; - - auto mtx = gko::matrix::Csr<>::create( - exec, gko::dim<2>{3, 2}, gko::Array::view(exec, 4, values), - gko::Array::view(exec, 4, col_idxs), - gko::Array::view(exec, 4, row_ptrs), - std::make_shared(2)); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + value_type values[] = {1.0, 2.0, 3.0, 4.0}; + index_type col_idxs[] = {0, 1, 1, 0}; + index_type row_ptrs[] = {0, 2, 3, 4}; + + auto mtx = gko::matrix::Csr::create( + this->exec, gko::dim<2>{3, 2}, + gko::Array::view(this->exec, 4, values), + gko::Array::view(this->exec, 4, col_idxs), + gko::Array::view(this->exec, 4, row_ptrs), + std::make_shared(2)); ASSERT_EQ(mtx->get_num_srow_elements(), 1); ASSERT_EQ(mtx->get_const_values(), values); @@ -142,49 +160,54 @@ TEST_F(Csr, CanBeCreatedFromExistingData) } -TEST_F(Csr, CanBeCopied) +TYPED_TEST(Csr, CanBeCopied) { - auto copy = Mtx::create(exec); + using Mtx = typename TestFixture::Mtx; + auto copy = Mtx::create(this->exec); - copy->copy_from(mtx.get()); + copy->copy_from(this->mtx.get()); - assert_equal_to_original_mtx(mtx.get()); - mtx->get_values()[1] = 5.0; - assert_equal_to_original_mtx(copy.get()); + this->assert_equal_to_original_mtx(this->mtx.get()); + this->mtx->get_values()[1] = 5.0; + this->assert_equal_to_original_mtx(copy.get()); } -TEST_F(Csr, CanBeMoved) +TYPED_TEST(Csr, 
CanBeMoved) { - auto copy = Mtx::create(exec); + using Mtx = typename TestFixture::Mtx; + auto copy = Mtx::create(this->exec); - copy->copy_from(std::move(mtx)); + copy->copy_from(std::move(this->mtx)); - assert_equal_to_original_mtx(copy.get()); + this->assert_equal_to_original_mtx(copy.get()); } -TEST_F(Csr, CanBeCloned) +TYPED_TEST(Csr, CanBeCloned) { - auto clone = mtx->clone(); + using Mtx = typename TestFixture::Mtx; + auto clone = this->mtx->clone(); - assert_equal_to_original_mtx(mtx.get()); - mtx->get_values()[1] = 5.0; - assert_equal_to_original_mtx(dynamic_cast(clone.get())); + this->assert_equal_to_original_mtx(this->mtx.get()); + this->mtx->get_values()[1] = 5.0; + this->assert_equal_to_original_mtx(dynamic_cast(clone.get())); } -TEST_F(Csr, CanBeCleared) +TYPED_TEST(Csr, CanBeCleared) { - mtx->clear(); + this->mtx->clear(); - assert_empty(mtx.get()); + this->assert_empty(this->mtx.get()); } -TEST_F(Csr, CanBeReadFromMatrixData) +TYPED_TEST(Csr, CanBeReadFromMatrixData) { - auto m = Mtx::create(exec, std::make_shared(2)); + using Mtx = typename TestFixture::Mtx; + auto m = Mtx::create(this->exec, + std::make_shared(2)); m->read({{2, 3}, {{0, 0, 1.0}, @@ -194,23 +217,25 @@ TEST_F(Csr, CanBeReadFromMatrixData) {1, 1, 5.0}, {1, 2, 0.0}}}); - assert_equal_to_original_mtx(m.get()); + this->assert_equal_to_original_mtx(m.get()); } -TEST_F(Csr, GeneratesCorrectMatrixData) +TYPED_TEST(Csr, GeneratesCorrectMatrixData) { - using tpl = gko::matrix_data<>::nonzero_type; - gko::matrix_data<> data; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using tpl = typename gko::matrix_data::nonzero_type; + gko::matrix_data data; - mtx->write(data); + this->mtx->write(data); ASSERT_EQ(data.size, gko::dim<2>(2, 3)); ASSERT_EQ(data.nonzeros.size(), 4); - EXPECT_EQ(data.nonzeros[0], tpl(0, 0, 1.0)); - EXPECT_EQ(data.nonzeros[1], tpl(0, 1, 3.0)); - EXPECT_EQ(data.nonzeros[2], tpl(0, 2, 2.0)); - 
EXPECT_EQ(data.nonzeros[3], tpl(1, 1, 5.0)); + EXPECT_EQ(data.nonzeros[0], tpl(0, 0, value_type{1.0})); + EXPECT_EQ(data.nonzeros[1], tpl(0, 1, value_type{3.0})); + EXPECT_EQ(data.nonzeros[2], tpl(0, 2, value_type{2.0})); + EXPECT_EQ(data.nonzeros[3], tpl(1, 1, value_type{5.0})); } diff --git a/core/test/matrix/csr_builder.cpp b/core/test/matrix/csr_builder.cpp new file mode 100644 index 00000000000..9a1bfb6eb5f --- /dev/null +++ b/core/test/matrix/csr_builder.cpp @@ -0,0 +1,119 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/csr_builder.hpp" + + +#include + + +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +template +class CsrBuilder : public ::testing::Test { +public: + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Mtx = gko::matrix::Csr; + +protected: + CsrBuilder() + : exec(gko::ReferenceExecutor::create()), + mtx(Mtx::create(exec, gko::dim<2>{2, 3}, 4)) + {} + + std::shared_ptr exec; + std::unique_ptr mtx; +}; + +TYPED_TEST_CASE(CsrBuilder, gko::test::ValueIndexTypes); + + +TYPED_TEST(CsrBuilder, ReturnsCorrectArrays) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + gko::matrix::CsrBuilder builder{this->mtx.get()}; + + auto builder_col_idxs = builder.get_col_idx_array().get_data(); + auto builder_values = builder.get_value_array().get_data(); + auto ref_col_idxs = this->mtx->get_col_idxs(); + auto ref_values = this->mtx->get_values(); + + ASSERT_EQ(builder_col_idxs, ref_col_idxs); + ASSERT_EQ(builder_values, ref_values); +} + + +TYPED_TEST(CsrBuilder, UpdatesSrowOnDestruction) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + struct mock_strategy : public 
Mtx::strategy_type { + virtual void process(const gko::Array &, + gko::Array *) override + { + *was_called = true; + } + + virtual int64_t clac_size(const int64_t nnz) override { return 0; } + + virtual std::shared_ptr copy() override + { + return std::make_shared(*was_called); + } + + mock_strategy(bool &flag) : Mtx::strategy_type(""), was_called(&flag) {} + + bool *was_called; + }; + bool was_called{}; + this->mtx->set_strategy(std::make_shared(was_called)); + was_called = false; + + gko::matrix::CsrBuilder{this->mtx.get()}; + + ASSERT_TRUE(was_called); +} + + +} // namespace diff --git a/core/test/matrix/dense.cpp b/core/test/matrix/dense.cpp index eec17387754..c89f9a740e5 100644 --- a/core/test/matrix/dense.cpp +++ b/core/test/matrix/dense.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -40,59 +40,67 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/test/utils.hpp" + + namespace { +template class Dense : public ::testing::Test { protected: + using value_type = T; Dense() : exec(gko::ReferenceExecutor::create()), - mtx(gko::initialize>( + mtx(gko::initialize>( 4, {{1.0, 2.0, 3.0}, {1.5, 2.5, 3.5}}, exec)) {} - static void assert_equal_to_original_mtx(gko::matrix::Dense<> *m) + static void assert_equal_to_original_mtx(gko::matrix::Dense *m) { ASSERT_EQ(m->get_size(), gko::dim<2>(2, 3)); ASSERT_EQ(m->get_stride(), 4); ASSERT_EQ(m->get_num_stored_elements(), 2 * 4); - EXPECT_EQ(m->at(0, 0), 1.0); - EXPECT_EQ(m->at(0, 1), 2.0); - EXPECT_EQ(m->at(0, 2), 3.0); - EXPECT_EQ(m->at(1, 0), 1.5); - EXPECT_EQ(m->at(1, 1), 2.5); - ASSERT_EQ(m->at(1, 2), 3.5); + EXPECT_EQ(m->at(0, 0), value_type{1.0}); + EXPECT_EQ(m->at(0, 1), value_type{2.0}); + EXPECT_EQ(m->at(0, 2), value_type{3.0}); + EXPECT_EQ(m->at(1, 0), value_type{1.5}); + EXPECT_EQ(m->at(1, 1), value_type{2.5}); + ASSERT_EQ(m->at(1, 2), value_type{3.5}); } - static void assert_empty(gko::matrix::Dense<> *m) + static void assert_empty(gko::matrix::Dense *m) { ASSERT_EQ(m->get_size(), gko::dim<2>(0, 0)); ASSERT_EQ(m->get_num_stored_elements(), 0); } std::shared_ptr exec; - std::unique_ptr> mtx; + std::unique_ptr> mtx; }; +TYPED_TEST_CASE(Dense, gko::test::ValueTypes); -TEST_F(Dense, CanBeEmpty) + +TYPED_TEST(Dense, CanBeEmpty) { - auto empty = gko::matrix::Dense<>::create(exec); - assert_empty(empty.get()); + auto empty = gko::matrix::Dense::create(this->exec); + this->assert_empty(empty.get()); } -TEST_F(Dense, ReturnsNullValuesArrayWhenEmpty) +TYPED_TEST(Dense, ReturnsNullValuesArrayWhenEmpty) { - auto empty = gko::matrix::Dense<>::create(exec); + auto empty = gko::matrix::Dense::create(this->exec); ASSERT_EQ(empty->get_const_values(), nullptr); } -TEST_F(Dense, CanBeConstructedWithSize) +TYPED_TEST(Dense, CanBeConstructedWithSize) { - auto m = gko::matrix::Dense<>::create(exec, gko::dim<2>{2, 3}); + auto m = + 
gko::matrix::Dense::create(this->exec, gko::dim<2>{2, 3}); ASSERT_EQ(m->get_size(), gko::dim<2>(2, 3)); EXPECT_EQ(m->get_stride(), 3); @@ -100,9 +108,10 @@ TEST_F(Dense, CanBeConstructedWithSize) } -TEST_F(Dense, CanBeConstructedWithSizeAndStride) +TYPED_TEST(Dense, CanBeConstructedWithSizeAndStride) { - auto m = gko::matrix::Dense<>::create(exec, gko::dim<2>{2, 3}, 4); + auto m = + gko::matrix::Dense::create(this->exec, gko::dim<2>{2, 3}, 4); ASSERT_EQ(m->get_size(), gko::dim<2>(2, 3)); EXPECT_EQ(m->get_stride(), 4); @@ -110,172 +119,187 @@ TEST_F(Dense, CanBeConstructedWithSizeAndStride) } -TEST_F(Dense, CanBeConstructedFromExistingData) +TYPED_TEST(Dense, CanBeConstructedFromExistingData) { + using value_type = typename TestFixture::value_type; // clang-format off - double data[] = { + value_type data[] = { 1.0, 2.0, -1.0, 3.0, 4.0, -1.0, 5.0, 6.0, -1.0}; // clang-format on - auto m = gko::matrix::Dense<>::create( - exec, gko::dim<2>{3, 2}, gko::Array::view(exec, 9, data), 3); + auto m = gko::matrix::Dense::create( + this->exec, gko::dim<2>{3, 2}, + gko::Array::view(this->exec, 9, data), 3); ASSERT_EQ(m->get_const_values(), data); - ASSERT_EQ(m->at(2, 1), 6.0); + ASSERT_EQ(m->at(2, 1), value_type{6.0}); } -TEST_F(Dense, KnowsItsSizeAndValues) +TYPED_TEST(Dense, KnowsItsSizeAndValues) { - assert_equal_to_original_mtx(mtx.get()); + this->assert_equal_to_original_mtx(this->mtx.get()); } -TEST_F(Dense, CanBeListConstructed) +TYPED_TEST(Dense, CanBeListConstructed) { - auto m = gko::initialize>({1.0, 2.0}, exec); + using value_type = typename TestFixture::value_type; + auto m = + gko::initialize>({1.0, 2.0}, this->exec); ASSERT_EQ(m->get_size(), gko::dim<2>(2, 1)); ASSERT_EQ(m->get_num_stored_elements(), 2); - EXPECT_EQ(m->at(0), 1); - EXPECT_EQ(m->at(1), 2); + EXPECT_EQ(m->at(0), value_type{1}); + EXPECT_EQ(m->at(1), value_type{2}); } -TEST_F(Dense, CanBeListConstructedWithstride) +TYPED_TEST(Dense, CanBeListConstructedWithstride) { - auto m = gko::initialize>(2, 
{1.0, 2.0}, exec); + using value_type = typename TestFixture::value_type; + auto m = gko::initialize>(2, {1.0, 2.0}, + this->exec); ASSERT_EQ(m->get_size(), gko::dim<2>(2, 1)); ASSERT_EQ(m->get_num_stored_elements(), 4); - EXPECT_EQ(m->at(0), 1.0); - EXPECT_EQ(m->at(1), 2.0); + EXPECT_EQ(m->at(0), value_type{1.0}); + EXPECT_EQ(m->at(1), value_type{2.0}); } -TEST_F(Dense, CanBeDoubleListConstructed) +TYPED_TEST(Dense, CanBeDoubleListConstructed) { - auto m = gko::initialize>( - {{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}, exec); + using value_type = typename TestFixture::value_type; + using T = value_type; + auto m = gko::initialize>( + {I{1.0, 2.0}, I{3.0, 4.0}, I{5.0, 6.0}}, this->exec); ASSERT_EQ(m->get_size(), gko::dim<2>(3, 2)); ASSERT_EQ(m->get_num_stored_elements(), 6); - EXPECT_EQ(m->at(0), 1.0); - EXPECT_EQ(m->at(1), 2.0); - EXPECT_EQ(m->at(2), 3.0); - ASSERT_EQ(m->at(3), 4.0); - EXPECT_EQ(m->at(4), 5.0); + EXPECT_EQ(m->at(0), value_type{1.0}); + EXPECT_EQ(m->at(1), value_type{2.0}); + EXPECT_EQ(m->at(2), value_type{3.0}); + ASSERT_EQ(m->at(3), value_type{4.0}); + EXPECT_EQ(m->at(4), value_type{5.0}); } -TEST_F(Dense, CanBeDoubleListConstructedWithstride) +TYPED_TEST(Dense, CanBeDoubleListConstructedWithstride) { - auto m = gko::initialize>( - 4, {{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}, exec); + using value_type = typename TestFixture::value_type; + using T = value_type; + auto m = gko::initialize>( + 4, {I{1.0, 2.0}, I{3.0, 4.0}, I{5.0, 6.0}}, this->exec); ASSERT_EQ(m->get_size(), gko::dim<2>(3, 2)); ASSERT_EQ(m->get_num_stored_elements(), 12); - EXPECT_EQ(m->at(0), 1.0); - EXPECT_EQ(m->at(1), 2.0); - EXPECT_EQ(m->at(2), 3.0); - ASSERT_EQ(m->at(3), 4.0); - EXPECT_EQ(m->at(4), 5.0); + EXPECT_EQ(m->at(0), value_type{1.0}); + EXPECT_EQ(m->at(1), value_type{2.0}); + EXPECT_EQ(m->at(2), value_type{3.0}); + ASSERT_EQ(m->at(3), value_type{4.0}); + EXPECT_EQ(m->at(4), value_type{5.0}); } -TEST_F(Dense, CanBeCopied) +TYPED_TEST(Dense, CanBeCopied) { - auto mtx_copy = 
gko::matrix::Dense<>::create(exec); - mtx_copy->copy_from(mtx.get()); - assert_equal_to_original_mtx(mtx.get()); - mtx->at(0) = 7; - assert_equal_to_original_mtx(mtx_copy.get()); + auto mtx_copy = gko::matrix::Dense::create(this->exec); + mtx_copy->copy_from(this->mtx.get()); + this->assert_equal_to_original_mtx(this->mtx.get()); + this->mtx->at(0) = 7; + this->assert_equal_to_original_mtx(mtx_copy.get()); } -TEST_F(Dense, CanBeMoved) +TYPED_TEST(Dense, CanBeMoved) { - auto mtx_copy = gko::matrix::Dense<>::create(exec); - mtx_copy->copy_from(std::move(mtx)); - assert_equal_to_original_mtx(mtx_copy.get()); + auto mtx_copy = gko::matrix::Dense::create(this->exec); + mtx_copy->copy_from(std::move(this->mtx)); + this->assert_equal_to_original_mtx(mtx_copy.get()); } -TEST_F(Dense, CanBeCloned) +TYPED_TEST(Dense, CanBeCloned) { - auto mtx_clone = mtx->clone(); - assert_equal_to_original_mtx( - dynamic_cast(mtx_clone.get())); + auto mtx_clone = this->mtx->clone(); + this->assert_equal_to_original_mtx( + dynamic_castmtx.get())>(mtx_clone.get())); } -TEST_F(Dense, CanBeCleared) +TYPED_TEST(Dense, CanBeCleared) { - mtx->clear(); - assert_empty(mtx.get()); + this->mtx->clear(); + this->assert_empty(this->mtx.get()); } -TEST_F(Dense, CanBeReadFromMatrixData) +TYPED_TEST(Dense, CanBeReadFromMatrixData) { - auto m = gko::matrix::Dense<>::create(exec); - m->read(gko::matrix_data<>{{2, 3}, - {{0, 0, 1.0}, - {0, 1, 3.0}, - {0, 2, 2.0}, - {1, 0, 0.0}, - {1, 1, 5.0}, - {1, 2, 0.0}}}); + using value_type = typename TestFixture::value_type; + auto m = gko::matrix::Dense::create(this->exec); + m->read(gko::matrix_data{{2, 3}, + {{0, 0, 1.0}, + {0, 1, 3.0}, + {0, 2, 2.0}, + {1, 0, 0.0}, + {1, 1, 5.0}, + {1, 2, 0.0}}}); ASSERT_EQ(m->get_size(), gko::dim<2>(2, 3)); ASSERT_EQ(m->get_num_stored_elements(), 6); - EXPECT_EQ(m->at(0, 0), 1.0); - EXPECT_EQ(m->at(1, 0), 0.0); - EXPECT_EQ(m->at(0, 1), 3.0); - EXPECT_EQ(m->at(1, 1), 5.0); - EXPECT_EQ(m->at(0, 2), 2.0); - ASSERT_EQ(m->at(1, 2), 
0.0); + EXPECT_EQ(m->at(0, 0), value_type{1.0}); + EXPECT_EQ(m->at(1, 0), value_type{0.0}); + EXPECT_EQ(m->at(0, 1), value_type{3.0}); + EXPECT_EQ(m->at(1, 1), value_type{5.0}); + EXPECT_EQ(m->at(0, 2), value_type{2.0}); + ASSERT_EQ(m->at(1, 2), value_type{0.0}); } -TEST_F(Dense, GeneratesCorrectMatrixData) +TYPED_TEST(Dense, GeneratesCorrectMatrixData) { - using tpl = gko::matrix_data<>::nonzero_type; - gko::matrix_data<> data; + using value_type = typename TestFixture::value_type; + using tpl = typename gko::matrix_data::nonzero_type; + gko::matrix_data data; - mtx->write(data); + this->mtx->write(data); ASSERT_EQ(data.size, gko::dim<2>(2, 3)); ASSERT_EQ(data.nonzeros.size(), 6); - EXPECT_EQ(data.nonzeros[0], tpl(0, 0, 1.0)); - EXPECT_EQ(data.nonzeros[1], tpl(0, 1, 2.0)); - EXPECT_EQ(data.nonzeros[2], tpl(0, 2, 3.0)); - EXPECT_EQ(data.nonzeros[3], tpl(1, 0, 1.5)); - EXPECT_EQ(data.nonzeros[4], tpl(1, 1, 2.5)); - EXPECT_EQ(data.nonzeros[5], tpl(1, 2, 3.5)); + EXPECT_EQ(data.nonzeros[0], tpl(0, 0, value_type{1.0})); + EXPECT_EQ(data.nonzeros[1], tpl(0, 1, value_type{2.0})); + EXPECT_EQ(data.nonzeros[2], tpl(0, 2, value_type{3.0})); + EXPECT_EQ(data.nonzeros[3], tpl(1, 0, value_type{1.5})); + EXPECT_EQ(data.nonzeros[4], tpl(1, 1, value_type{2.5})); + EXPECT_EQ(data.nonzeros[5], tpl(1, 2, value_type{3.5})); } -TEST_F(Dense, CanCreateSubmatrix) +TYPED_TEST(Dense, CanCreateSubmatrix) { - auto submtx = mtx->create_submatrix(gko::span{0, 1}, gko::span{1, 2}); + using value_type = typename TestFixture::value_type; + auto submtx = this->mtx->create_submatrix(gko::span{0, 1}, gko::span{1, 2}); - EXPECT_EQ(submtx->at(0, 0), 2.0); - EXPECT_EQ(submtx->at(0, 1), 3.0); - EXPECT_EQ(submtx->at(1, 0), 2.5); - EXPECT_EQ(submtx->at(1, 1), 3.5); + EXPECT_EQ(submtx->at(0, 0), value_type{2.0}); + EXPECT_EQ(submtx->at(0, 1), value_type{3.0}); + EXPECT_EQ(submtx->at(1, 0), value_type{2.5}); + EXPECT_EQ(submtx->at(1, 1), value_type{3.5}); } -TEST_F(Dense, CanCreateSubmatrixWithStride) 
+TYPED_TEST(Dense, CanCreateSubmatrixWithStride) { - auto submtx = mtx->create_submatrix(gko::span{0, 1}, gko::span{1, 2}, 3); - - EXPECT_EQ(submtx->at(0, 0), 2.0); - EXPECT_EQ(submtx->at(0, 1), 3.0); - EXPECT_EQ(submtx->at(1, 0), 1.5); - EXPECT_EQ(submtx->at(1, 1), 2.5); + using value_type = typename TestFixture::value_type; + auto submtx = + this->mtx->create_submatrix(gko::span{0, 1}, gko::span{1, 2}, 3); + + EXPECT_EQ(submtx->at(0, 0), value_type{2.0}); + EXPECT_EQ(submtx->at(0, 1), value_type{3.0}); + EXPECT_EQ(submtx->at(1, 0), value_type{1.5}); + EXPECT_EQ(submtx->at(1, 1), value_type{2.5}); } diff --git a/core/test/matrix/ell.cpp b/core/test/matrix/ell.cpp index fdefa61712e..6e92f1251ba 100644 --- a/core/test/matrix/ell.cpp +++ b/core/test/matrix/ell.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -36,19 +36,28 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/test/utils.hpp" + + namespace { +template class Ell : public ::testing::Test { protected: - using Mtx = gko::matrix::Ell<>; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Mtx = gko::matrix::Ell; Ell() : exec(gko::ReferenceExecutor::create()), - mtx(gko::matrix::Ell<>::create(exec, gko::dim<2>{2, 3}, 3)) + mtx(gko::matrix::Ell::create( + exec, gko::dim<2>{2, 3}, 3)) { - Mtx::value_type *v = mtx->get_values(); - Mtx::index_type *c = mtx->get_col_idxs(); + value_type *v = mtx->get_values(); + index_type *c = mtx->get_col_idxs(); c[0] = 0; c[1] = 1; c[2] = 1; @@ -82,12 +91,12 @@ class Ell : public ::testing::Test { EXPECT_EQ(c[3], 0); EXPECT_EQ(c[4], 2); EXPECT_EQ(c[5], 0); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 5.0); - EXPECT_EQ(v[2], 3.0); - EXPECT_EQ(v[3], 0.0); - EXPECT_EQ(v[4], 2.0); - EXPECT_EQ(v[5], 0.0); + EXPECT_EQ(v[0], value_type{1.0}); + EXPECT_EQ(v[1], value_type{5.0}); + EXPECT_EQ(v[2], value_type{3.0}); + EXPECT_EQ(v[3], value_type{0.0}); + EXPECT_EQ(v[4], value_type{2.0}); + EXPECT_EQ(v[5], value_type{0.0}); } void assert_empty(const Mtx *m) @@ -101,84 +110,97 @@ class Ell : public ::testing::Test { } }; +TYPED_TEST_CASE(Ell, gko::test::ValueIndexTypes); + -TEST_F(Ell, KnowsItsSize) +TYPED_TEST(Ell, KnowsItsSize) { - ASSERT_EQ(mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(mtx->get_num_stored_elements(), 6); - ASSERT_EQ(mtx->get_num_stored_elements_per_row(), 3); - ASSERT_EQ(mtx->get_stride(), 2); + ASSERT_EQ(this->mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(this->mtx->get_num_stored_elements(), 6); + ASSERT_EQ(this->mtx->get_num_stored_elements_per_row(), 3); + ASSERT_EQ(this->mtx->get_stride(), 2); } -TEST_F(Ell, ContainsCorrectData) { assert_equal_to_original_mtx(mtx.get()); } +TYPED_TEST(Ell, ContainsCorrectData) +{ + this->assert_equal_to_original_mtx(this->mtx.get()); +} 
-TEST_F(Ell, CanBeEmpty) +TYPED_TEST(Ell, CanBeEmpty) { - auto mtx = Mtx::create(exec); + using Mtx = typename TestFixture::Mtx; + auto mtx = Mtx::create(this->exec); - assert_empty(mtx.get()); + this->assert_empty(mtx.get()); } -TEST_F(Ell, CanBeCreatedFromExistingData) +TYPED_TEST(Ell, CanBeCreatedFromExistingData) { - double values[] = {1.0, 3.0, 4.0, -1.0, 2.0, 0.0, 0.0, -1.0}; - gko::int32 col_idxs[] = {0, 1, 0, -1, 1, 0, 0, -1}; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + value_type values[] = {1.0, 3.0, 4.0, -1.0, 2.0, 0.0, 0.0, -1.0}; + index_type col_idxs[] = {0, 1, 0, -1, 1, 0, 0, -1}; - auto mtx = gko::matrix::Ell<>::create( - exec, gko::dim<2>{3, 2}, gko::Array::view(exec, 8, values), - gko::Array::view(exec, 8, col_idxs), 2, 4); + auto mtx = gko::matrix::Ell::create( + this->exec, gko::dim<2>{3, 2}, + gko::Array::view(this->exec, 8, values), + gko::Array::view(this->exec, 8, col_idxs), 2, 4); ASSERT_EQ(mtx->get_const_values(), values); ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs); } -TEST_F(Ell, CanBeCopied) +TYPED_TEST(Ell, CanBeCopied) { - auto copy = Mtx::create(exec); + using Mtx = typename TestFixture::Mtx; + auto copy = Mtx::create(this->exec); - copy->copy_from(mtx.get()); + copy->copy_from(this->mtx.get()); - assert_equal_to_original_mtx(mtx.get()); - mtx->get_values()[1] = 5.0; - assert_equal_to_original_mtx(copy.get()); + this->assert_equal_to_original_mtx(this->mtx.get()); + this->mtx->get_values()[1] = 5.0; + this->assert_equal_to_original_mtx(copy.get()); } -TEST_F(Ell, CanBeMoved) +TYPED_TEST(Ell, CanBeMoved) { - auto copy = Mtx::create(exec); + using Mtx = typename TestFixture::Mtx; + auto copy = Mtx::create(this->exec); - copy->copy_from(std::move(mtx)); + copy->copy_from(std::move(this->mtx)); - assert_equal_to_original_mtx(copy.get()); + this->assert_equal_to_original_mtx(copy.get()); } -TEST_F(Ell, CanBeCloned) +TYPED_TEST(Ell, CanBeCloned) { - auto clone = 
mtx->clone(); + using Mtx = typename TestFixture::Mtx; + auto clone = this->mtx->clone(); - assert_equal_to_original_mtx(mtx.get()); - mtx->get_values()[1] = 5.0; - assert_equal_to_original_mtx(static_cast(clone.get())); + this->assert_equal_to_original_mtx(this->mtx.get()); + this->mtx->get_values()[1] = 5.0; + this->assert_equal_to_original_mtx(static_cast(clone.get())); } -TEST_F(Ell, CanBeCleared) +TYPED_TEST(Ell, CanBeCleared) { - mtx->clear(); + this->mtx->clear(); - assert_empty(mtx.get()); + this->assert_empty(this->mtx.get()); } -TEST_F(Ell, CanBeReadFromMatrixData) +TYPED_TEST(Ell, CanBeReadFromMatrixData) { - auto m = Mtx::create(exec); + using Mtx = typename TestFixture::Mtx; + auto m = Mtx::create(this->exec); m->read({{2, 3}, {{0, 0, 1.0}, {0, 1, 3.0}, @@ -187,23 +209,25 @@ TEST_F(Ell, CanBeReadFromMatrixData) {1, 1, 5.0}, {1, 2, 0.0}}}); - assert_equal_to_original_mtx(m.get()); + this->assert_equal_to_original_mtx(m.get()); } -TEST_F(Ell, GeneratesCorrectMatrixData) +TYPED_TEST(Ell, GeneratesCorrectMatrixData) { - using tpl = gko::matrix_data<>::nonzero_type; - gko::matrix_data<> data; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using tpl = typename gko::matrix_data::nonzero_type; + gko::matrix_data data; - mtx->write(data); + this->mtx->write(data); ASSERT_EQ(data.size, gko::dim<2>(2, 3)); ASSERT_EQ(data.nonzeros.size(), 4); - EXPECT_EQ(data.nonzeros[0], tpl(0, 0, 1.0)); - EXPECT_EQ(data.nonzeros[1], tpl(0, 1, 3.0)); - EXPECT_EQ(data.nonzeros[2], tpl(0, 2, 2.0)); - EXPECT_EQ(data.nonzeros[3], tpl(1, 1, 5.0)); + EXPECT_EQ(data.nonzeros[0], tpl(0, 0, value_type{1.0})); + EXPECT_EQ(data.nonzeros[1], tpl(0, 1, value_type{3.0})); + EXPECT_EQ(data.nonzeros[2], tpl(0, 2, value_type{2.0})); + EXPECT_EQ(data.nonzeros[3], tpl(1, 1, value_type{5.0})); } diff --git a/core/test/matrix/hybrid.cpp b/core/test/matrix/hybrid.cpp index 3a8ca343d10..dac9da86167 100644 --- a/core/test/matrix/hybrid.cpp 
+++ b/core/test/matrix/hybrid.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -36,19 +36,28 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/test/utils.hpp" + + namespace { +template class Hybrid : public ::testing::Test { protected: - using Mtx = gko::matrix::Hybrid<>; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Mtx = gko::matrix::Hybrid; Hybrid() : exec(gko::ReferenceExecutor::create()), - mtx(gko::matrix::Hybrid<>::create(exec, gko::dim<2>{2, 3}, 2, 2, 1)) + mtx(gko::matrix::Hybrid::create( + exec, gko::dim<2>{2, 3}, 2, 2, 1)) { - Mtx::value_type *v = mtx->get_ell_values(); - Mtx::index_type *c = mtx->get_ell_col_idxs(); + value_type *v = mtx->get_ell_values(); + index_type *c = mtx->get_ell_col_idxs(); c[0] = 0; c[1] = 1; c[2] = 1; @@ -80,11 +89,11 @@ class Hybrid : public ::testing::Test { EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 1); EXPECT_EQ(c[3], 0); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 5.0); - EXPECT_EQ(v[2], 3.0); - EXPECT_EQ(v[3], 0.0); - EXPECT_EQ(m->get_const_coo_values()[0], 2.0); + EXPECT_EQ(v[0], value_type{1.0}); + EXPECT_EQ(v[1], value_type{5.0}); + EXPECT_EQ(v[2], value_type{3.0}); + EXPECT_EQ(v[3], value_type{0.0}); + EXPECT_EQ(m->get_const_coo_values()[0], value_type{2.0}); EXPECT_EQ(m->get_const_coo_col_idxs()[0], 2); EXPECT_EQ(m->get_const_coo_row_idxs()[0], 0); } @@ -103,71 +112,83 @@ class Hybrid : public ::testing::Test { } }; +TYPED_TEST_CASE(Hybrid, gko::test::ValueIndexTypes); + -TEST_F(Hybrid, KnowsItsSize) +TYPED_TEST(Hybrid, KnowsItsSize) { - ASSERT_EQ(mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(mtx->get_ell_num_stored_elements(), 
4); - ASSERT_EQ(mtx->get_ell_num_stored_elements_per_row(), 2); - ASSERT_EQ(mtx->get_ell_stride(), 2); - ASSERT_EQ(mtx->get_coo_num_stored_elements(), 1); + ASSERT_EQ(this->mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(this->mtx->get_ell_num_stored_elements(), 4); + ASSERT_EQ(this->mtx->get_ell_num_stored_elements_per_row(), 2); + ASSERT_EQ(this->mtx->get_ell_stride(), 2); + ASSERT_EQ(this->mtx->get_coo_num_stored_elements(), 1); } -TEST_F(Hybrid, ContainsCorrectData) { assert_equal_to_original_mtx(mtx.get()); } +TYPED_TEST(Hybrid, ContainsCorrectData) +{ + this->assert_equal_to_original_mtx(this->mtx.get()); +} -TEST_F(Hybrid, CanBeEmpty) +TYPED_TEST(Hybrid, CanBeEmpty) { - auto mtx = Mtx::create(exec); + using Mtx = typename TestFixture::Mtx; + auto mtx = Mtx::create(this->exec); - assert_empty(mtx.get()); + this->assert_empty(mtx.get()); } -TEST_F(Hybrid, CanBeCopied) +TYPED_TEST(Hybrid, CanBeCopied) { - auto copy = Mtx::create(exec); + using Mtx = typename TestFixture::Mtx; + auto copy = Mtx::create(this->exec); - copy->copy_from(mtx.get()); + copy->copy_from(this->mtx.get()); - assert_equal_to_original_mtx(mtx.get()); - mtx->get_ell_values()[1] = 5.0; - assert_equal_to_original_mtx(copy.get()); + this->assert_equal_to_original_mtx(this->mtx.get()); + this->mtx->get_ell_values()[1] = 5.0; + this->assert_equal_to_original_mtx(copy.get()); } -TEST_F(Hybrid, CanBeMoved) +TYPED_TEST(Hybrid, CanBeMoved) { - auto copy = Mtx::create(exec); + using Mtx = typename TestFixture::Mtx; + auto copy = Mtx::create(this->exec); - copy->copy_from(std::move(mtx)); + copy->copy_from(std::move(this->mtx)); - assert_equal_to_original_mtx(copy.get()); + this->assert_equal_to_original_mtx(copy.get()); } -TEST_F(Hybrid, CanBeCloned) +TYPED_TEST(Hybrid, CanBeCloned) { - auto clone = mtx->clone(); + using Mtx = typename TestFixture::Mtx; + auto clone = this->mtx->clone(); - assert_equal_to_original_mtx(mtx.get()); - mtx->get_ell_values()[1] = 5.0; - 
assert_equal_to_original_mtx(static_cast(clone.get())); + this->assert_equal_to_original_mtx(this->mtx.get()); + this->mtx->get_ell_values()[1] = 5.0; + this->assert_equal_to_original_mtx(static_cast(clone.get())); } -TEST_F(Hybrid, CanBeCleared) +TYPED_TEST(Hybrid, CanBeCleared) { - mtx->clear(); + this->mtx->clear(); - assert_empty(mtx.get()); + this->assert_empty(this->mtx.get()); } -TEST_F(Hybrid, CanBeReadFromMatrixDataAutomatically) +TYPED_TEST(Hybrid, CanBeReadFromMatrixDataAutomatically) { - auto m = Mtx::create(exec, std::make_shared()); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto m = + Mtx::create(this->exec, std::make_shared()); m->read({{2, 3}, {{0, 0, 1.0}, {0, 1, 3.0}, @@ -194,16 +215,18 @@ TEST_F(Hybrid, CanBeReadFromMatrixDataAutomatically) EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 3.0); - EXPECT_EQ(v[2], 2.0); - EXPECT_EQ(v[3], 5.0); + EXPECT_EQ(v[0], value_type{1.0}); + EXPECT_EQ(v[1], value_type{3.0}); + EXPECT_EQ(v[2], value_type{2.0}); + EXPECT_EQ(v[3], value_type{5.0}); } -TEST_F(Hybrid, CanBeReadFromMatrixDataByColumns2) +TYPED_TEST(Hybrid, CanBeReadFromMatrixDataByColumns2) { - auto m = Mtx::create(exec, std::make_shared(2)); + using Mtx = typename TestFixture::Mtx; + auto m = Mtx::create(this->exec, + std::make_shared(2)); m->read({{2, 3}, {{0, 0, 1.0}, {0, 1, 3.0}, @@ -212,13 +235,16 @@ TEST_F(Hybrid, CanBeReadFromMatrixDataByColumns2) {1, 1, 5.0}, {1, 2, 0.0}}}); - assert_equal_to_original_mtx(m.get()); + this->assert_equal_to_original_mtx(m.get()); } -TEST_F(Hybrid, CanBeReadFromMatrixDataByPercent40) +TYPED_TEST(Hybrid, CanBeReadFromMatrixDataByPercent40) { - auto m = Mtx::create(exec, std::make_shared(0.4)); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto m = Mtx::create(this->exec, + std::make_shared(0.4)); m->read({{2, 3}, {{0, 0, 1.0}, {0, 1, 3.0}, @@ -237,15 
+263,15 @@ TEST_F(Hybrid, CanBeReadFromMatrixDataByPercent40) EXPECT_EQ(p, 2); EXPECT_EQ(c[0], 0); EXPECT_EQ(c[1], 1); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 5.0); + EXPECT_EQ(v[0], value_type{1.0}); + EXPECT_EQ(v[1], value_type{5.0}); auto coo_v = m->get_const_coo_values(); auto coo_c = m->get_const_coo_col_idxs(); auto coo_r = m->get_const_coo_row_idxs(); ASSERT_EQ(m->get_coo_num_stored_elements(), 2); - EXPECT_EQ(coo_v[0], 3.0); - EXPECT_EQ(coo_v[1], 2.0); + EXPECT_EQ(coo_v[0], value_type{3.0}); + EXPECT_EQ(coo_v[1], value_type{2.0}); EXPECT_EQ(coo_c[0], 1); EXPECT_EQ(coo_c[1], 2); EXPECT_EQ(coo_r[0], 0); @@ -253,19 +279,21 @@ TEST_F(Hybrid, CanBeReadFromMatrixDataByPercent40) } -TEST_F(Hybrid, GeneratesCorrectMatrixData) +TYPED_TEST(Hybrid, GeneratesCorrectMatrixData) { - using tpl = gko::matrix_data<>::nonzero_type; - gko::matrix_data<> data; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using tpl = typename gko::matrix_data::nonzero_type; + gko::matrix_data data; - mtx->write(data); + this->mtx->write(data); ASSERT_EQ(data.size, gko::dim<2>(2, 3)); ASSERT_EQ(data.nonzeros.size(), 4); - EXPECT_EQ(data.nonzeros[0], tpl(0, 0, 1.0)); - EXPECT_EQ(data.nonzeros[1], tpl(0, 1, 3.0)); - EXPECT_EQ(data.nonzeros[2], tpl(0, 2, 2.0)); - EXPECT_EQ(data.nonzeros[3], tpl(1, 1, 5.0)); + EXPECT_EQ(data.nonzeros[0], tpl(0, 0, value_type{1.0})); + EXPECT_EQ(data.nonzeros[1], tpl(0, 1, value_type{3.0})); + EXPECT_EQ(data.nonzeros[2], tpl(0, 2, value_type{2.0})); + EXPECT_EQ(data.nonzeros[3], tpl(1, 1, value_type{5.0})); } diff --git a/core/test/matrix/identity.cpp b/core/test/matrix/identity.cpp index 58a08ac37d6..f890a9dd039 100644 --- a/core/test/matrix/identity.cpp +++ b/core/test/matrix/identity.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -36,43 +36,53 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include +#include "core/test/utils.hpp" + + namespace { +template class Identity : public ::testing::Test { protected: - using Id = gko::matrix::Identity<>; - using Vec = gko::matrix::Dense<>; + using value_type = T; + using Id = gko::matrix::Identity; + using Vec = gko::matrix::Dense; Identity() : exec(gko::ReferenceExecutor::create()) {} std::shared_ptr exec; }; +TYPED_TEST_CASE(Identity, gko::test::ValueTypes); -TEST_F(Identity, CanBeEmpty) + +TYPED_TEST(Identity, CanBeEmpty) { - auto empty = Id::create(exec); + using Id = typename TestFixture::Id; + auto empty = Id::create(this->exec); ASSERT_EQ(empty->get_size(), gko::dim<2>(0, 0)); } -TEST_F(Identity, CanBeConstructedWithSize) +TYPED_TEST(Identity, CanBeConstructedWithSize) { - auto identity = Id::create(exec, 5); + using Id = typename TestFixture::Id; + auto identity = Id::create(this->exec, 5); ASSERT_EQ(identity->get_size(), gko::dim<2>(5, 5)); } -TEST_F(Identity, AppliesToVector) +TYPED_TEST(Identity, AppliesToVector) { - auto identity = Id::create(exec, 3); - auto x = Vec::create(exec, gko::dim<2>{3, 1}); - auto b = gko::initialize({2.0, 1.0, 5.0}, exec); + using Id = typename TestFixture::Id; + using Vec = typename TestFixture::Vec; + auto identity = Id::create(this->exec, 3); + auto x = Vec::create(this->exec, gko::dim<2>{3, 1}); + auto b = gko::initialize({2.0, 1.0, 5.0}, this->exec); identity->apply(b.get(), x.get()); @@ -80,12 +90,15 @@ TEST_F(Identity, AppliesToVector) } -TEST_F(Identity, AppliesToMultipleVectors) +TYPED_TEST(Identity, AppliesToMultipleVectors) { - auto identity = Id::create(exec, 3); - auto x = Vec::create(exec, gko::dim<2>{3, 2}, 3); - auto b = - gko::initialize(3, {{2.0, 3.0}, {1.0, 2.0}, {5.0, -1.0}}, exec); + using Id = typename TestFixture::Id; + using Vec = typename TestFixture::Vec; + using T = 
typename TestFixture::value_type; + auto identity = Id::create(this->exec, 3); + auto x = Vec::create(this->exec, gko::dim<2>{3, 2}, 3); + auto b = gko::initialize( + 3, {I{2.0, 3.0}, I{1.0, 2.0}, I{5.0, -1.0}}, this->exec); identity->apply(b.get(), x.get()); @@ -93,11 +106,20 @@ TEST_F(Identity, AppliesToMultipleVectors) } -TEST(IdentityFactory, CanGenerateIdentityMatrix) +template +class IdentityFactory : public ::testing::Test { +protected: + using value_type = T; +}; + +TYPED_TEST_CASE(IdentityFactory, gko::test::ValueTypes); + + +TYPED_TEST(IdentityFactory, CanGenerateIdentityMatrix) { auto exec = gko::ReferenceExecutor::create(); - auto id_factory = gko::matrix::IdentityFactory<>::create(exec); - auto mtx = gko::matrix::Dense<>::create(exec, gko::dim<2>{5, 5}); + auto id_factory = gko::matrix::IdentityFactory::create(exec); + auto mtx = gko::matrix::Dense::create(exec, gko::dim<2>{5, 5}); auto id = id_factory->generate(std::move(mtx)); diff --git a/core/test/matrix/permutation.cpp b/core/test/matrix/permutation.cpp new file mode 100644 index 00000000000..c64f39fb3e2 --- /dev/null +++ b/core/test/matrix/permutation.cpp @@ -0,0 +1,294 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include +#include +#include +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +template +class Permutation : public ::testing::Test { +protected: + using v_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using i_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Vec = gko::matrix::Dense; + using Csr = gko::matrix::Csr; + Permutation() + : exec(gko::ReferenceExecutor::create()), + mtx(gko::matrix::Permutation::create( + exec, gko::dim<2>{4, 3}, gko::Array{exec, {1, 0, 2, 3}})) + {} + + + static void assert_equal_to_original_mtx( + gko::matrix::Permutation *m) + { + auto perm = m->get_permutation(); + ASSERT_EQ(m->get_size(), gko::dim<2>(4, 3)); + ASSERT_EQ(m->get_permutation_size(), 4); + ASSERT_EQ(perm[0], 1); + ASSERT_EQ(perm[1], 0); + ASSERT_EQ(perm[2], 2); + ASSERT_EQ(perm[3], 3); + } + + static void assert_empty(gko::matrix::Permutation *m) + { + ASSERT_EQ(m->get_size(), gko::dim<2>(0, 0)); + ASSERT_EQ(m->get_permutation_size(), 0); + } + + std::shared_ptr exec; + std::unique_ptr> mtx; +}; + +TYPED_TEST_CASE(Permutation, 
gko::test::ValueIndexTypes); + + +TYPED_TEST(Permutation, CanBeEmpty) +{ + using i_type = typename TestFixture::i_type; + auto empty = gko::matrix::Permutation::create(this->exec); + + this->assert_empty(empty.get()); +} + + +TYPED_TEST(Permutation, ReturnsNullValuesArrayWhenEmpty) +{ + using i_type = typename TestFixture::i_type; + auto empty = gko::matrix::Permutation::create(this->exec); + + ASSERT_EQ(empty->get_const_permutation(), nullptr); +} + + +TYPED_TEST(Permutation, CanBeConstructedWithSize) +{ + using i_type = typename TestFixture::i_type; + auto m = + gko::matrix::Permutation::create(this->exec, gko::dim<2>{2, 3}); + + ASSERT_EQ(m->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(m->get_permutation_size(), 2); +} + + +TYPED_TEST(Permutation, FactorySetsCorrectPermuteMask) +{ + using i_type = typename TestFixture::i_type; + auto m = gko::matrix::Permutation::create(this->exec); + auto mask = m->get_permute_mask(); + + ASSERT_EQ(mask, gko::matrix::row_permute); +} + + +TYPED_TEST(Permutation, PermutationCanBeConstructedFromExistingData) +{ + using i_type = typename TestFixture::i_type; + using i_type = typename TestFixture::i_type; + i_type data[] = {1, 0, 2}; + + auto m = gko::matrix::Permutation::create( + this->exec, gko::dim<2>{3, 5}, + gko::Array::view(this->exec, 3, data)); + + ASSERT_EQ(m->get_const_permutation(), data); +} + + +TYPED_TEST(Permutation, CanBeConstructedWithSizeAndMask) +{ + using i_type = typename TestFixture::i_type; + auto m = gko::matrix::Permutation::create( + this->exec, gko::dim<2>{2, 3}, gko::matrix::column_permute); + + ASSERT_EQ(m->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(m->get_permutation_size(), 2); + ASSERT_EQ(m->get_permute_mask(), gko::matrix::column_permute); +} + + +TYPED_TEST(Permutation, CanExplicitlyOverrideSetPermuteMask) +{ + using i_type = typename TestFixture::i_type; + auto m = gko::matrix::Permutation::create( + this->exec, gko::dim<2>{2, 3}, gko::matrix::column_permute); + + auto mask = 
m->get_permute_mask(); + ASSERT_EQ(mask, gko::matrix::column_permute); + + m->set_permute_mask(gko::matrix::row_permute | + gko::matrix::inverse_permute); + + auto s_mask = m->get_permute_mask(); + ASSERT_EQ(s_mask, gko::matrix::row_permute | gko::matrix::inverse_permute); +} + + +TYPED_TEST(Permutation, PermutationThrowsforWrongRowPermDimensions) +{ + using i_type = typename TestFixture::i_type; + i_type data[] = {0, 2, 1}; + + ASSERT_THROW(gko::matrix::Permutation::create( + this->exec, gko::dim<2>{4, 2}, + gko::Array::view(this->exec, 3, data)), + gko::ValueMismatch); +} + + +TYPED_TEST(Permutation, SettingMaskDoesNotModifyData) +{ + using i_type = typename TestFixture::i_type; + i_type data[] = {1, 0, 2}; + + auto m = gko::matrix::Permutation::create( + this->exec, gko::dim<2>{3, 5}, + gko::Array::view(this->exec, 3, data)); + + auto mask = m->get_permute_mask(); + ASSERT_EQ(m->get_const_permutation(), data); + ASSERT_EQ(mask, gko::matrix::row_permute); + + m->set_permute_mask(gko::matrix::row_permute | + gko::matrix::inverse_permute); + + auto s_mask = m->get_permute_mask(); + ASSERT_EQ(s_mask, gko::matrix::row_permute | gko::matrix::inverse_permute); + ASSERT_EQ(m->get_const_permutation(), data); +} + + +TYPED_TEST(Permutation, PermutationThrowsforWrongColPermDimensions) +{ + using i_type = typename TestFixture::i_type; + i_type data[] = {0, 2, 1}; + + ASSERT_THROW(gko::matrix::Permutation::create( + this->exec, gko::dim<2>{3, 4}, + gko::Array::view(this->exec, 3, data), + gko::matrix::column_permute), + gko::ValueMismatch); +} + + +TYPED_TEST(Permutation, KnowsItsSizeAndValues) +{ + this->assert_equal_to_original_mtx(this->mtx.get()); +} + + +TYPED_TEST(Permutation, CanBeCopied) +{ + using i_type = typename TestFixture::i_type; + auto mtx_copy = gko::matrix::Permutation::create(this->exec); + + mtx_copy->copy_from(this->mtx.get()); + + this->assert_equal_to_original_mtx(this->mtx.get()); + this->mtx->get_permutation()[0] = 3; + 
this->assert_equal_to_original_mtx(mtx_copy.get()); +} + + +TYPED_TEST(Permutation, CanBeMoved) +{ + using i_type = typename TestFixture::i_type; + auto mtx_copy = gko::matrix::Permutation::create(this->exec); + + mtx_copy->copy_from(std::move(this->mtx)); + + this->assert_equal_to_original_mtx(mtx_copy.get()); +} + + +TYPED_TEST(Permutation, CopyingPreservesMask) +{ + using i_type = typename TestFixture::i_type; + auto mtx_copy = gko::matrix::Permutation::create(this->exec); + + mtx_copy->copy_from(this->mtx.get()); + + auto o_mask = this->mtx->get_permute_mask(); + auto n_mask = mtx_copy->get_permute_mask(); + ASSERT_EQ(o_mask, gko::matrix::row_permute); + ASSERT_EQ(o_mask, n_mask); + + this->mtx->set_permute_mask(gko::matrix::column_permute); + + o_mask = this->mtx->get_permute_mask(); + n_mask = mtx_copy->get_permute_mask(); + ASSERT_EQ(o_mask, gko::matrix::column_permute); + ASSERT_NE(o_mask, n_mask); + + mtx_copy->copy_from(this->mtx.get()); + + n_mask = mtx_copy->get_permute_mask(); + ASSERT_EQ(o_mask, n_mask); +} + + +TYPED_TEST(Permutation, CanBeCloned) +{ + auto mtx_clone = this->mtx->clone(); + + this->assert_equal_to_original_mtx( + dynamic_castmtx.get())>(mtx_clone.get())); +} + + +TYPED_TEST(Permutation, CanBeCleared) +{ + this->mtx->clear(); + + this->assert_empty(this->mtx.get()); +} + + +} // namespace diff --git a/core/test/matrix/sellp.cpp b/core/test/matrix/sellp.cpp index bc3d790ee34..d6f139ba82e 100644 --- a/core/test/matrix/sellp.cpp +++ b/core/test/matrix/sellp.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -36,16 +36,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/test/utils.hpp" + + namespace { +template class Sellp : public ::testing::Test { protected: - using Mtx = gko::matrix::Sellp<>; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Mtx = gko::matrix::Sellp; Sellp() : exec(gko::ReferenceExecutor::create()), - mtx(gko::matrix::Sellp<>::create(exec, gko::dim<2>{2, 3}, 3)) + mtx(gko::matrix::Sellp::create( + exec, gko::dim<2>{2, 3}, 3)) { mtx->read( {{2, 3}, {{0, 0, 1.0}, {0, 1, 3.0}, {0, 2, 2.0}, {1, 1, 5.0}}}); @@ -77,12 +86,12 @@ class Sellp : public ::testing::Test { EXPECT_EQ(c[gko::matrix::default_slice_size + 1], 0); EXPECT_EQ(c[2 * gko::matrix::default_slice_size], 2); EXPECT_EQ(c[2 * gko::matrix::default_slice_size + 1], 0); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 5.0); - EXPECT_EQ(v[gko::matrix::default_slice_size], 3.0); - EXPECT_EQ(v[gko::matrix::default_slice_size + 1], 0.0); - EXPECT_EQ(v[2 * gko::matrix::default_slice_size], 2.0); - EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], 0.0); + EXPECT_EQ(v[0], value_type{1.0}); + EXPECT_EQ(v[1], value_type{5.0}); + EXPECT_EQ(v[gko::matrix::default_slice_size], value_type{3.0}); + EXPECT_EQ(v[gko::matrix::default_slice_size + 1], value_type{0.0}); + EXPECT_EQ(v[2 * gko::matrix::default_slice_size], value_type{2.0}); + EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], value_type{0.0}); } void assert_equal_to_original_mtx_with_slice_size_and_stride_factor( @@ -109,12 +118,12 @@ class Sellp : public ::testing::Test { EXPECT_EQ(c[3], 0); EXPECT_EQ(c[4], 2); EXPECT_EQ(c[5], 0); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 5.0); - EXPECT_EQ(v[2], 3.0); - EXPECT_EQ(v[3], 0.0); - EXPECT_EQ(v[4], 2.0); - EXPECT_EQ(v[5], 0.0); + EXPECT_EQ(v[0], value_type{1.0}); + EXPECT_EQ(v[1], value_type{5.0}); + EXPECT_EQ(v[2], value_type{3.0}); + EXPECT_EQ(v[3], value_type{0.0}); + EXPECT_EQ(v[4], value_type{2.0}); + 
EXPECT_EQ(v[5], value_type{0.0}); } void assert_empty(const Mtx *m) @@ -125,34 +134,43 @@ class Sellp : public ::testing::Test { ASSERT_EQ(m->get_const_values(), nullptr); ASSERT_EQ(m->get_const_col_idxs(), nullptr); ASSERT_EQ(m->get_const_slice_lengths(), nullptr); - ASSERT_EQ(m->get_const_slice_sets(), nullptr); + ASSERT_NE(m->get_const_slice_sets(), nullptr); } }; +TYPED_TEST_CASE(Sellp, gko::test::ValueIndexTypes); + -TEST_F(Sellp, KnowsItsSize) +TYPED_TEST(Sellp, KnowsItsSize) { - ASSERT_EQ(mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(mtx->get_num_stored_elements(), 192); - ASSERT_EQ(mtx->get_slice_size(), gko::matrix::default_slice_size); - ASSERT_EQ(mtx->get_stride_factor(), gko::matrix::default_stride_factor); - ASSERT_EQ(mtx->get_total_cols(), 3); + ASSERT_EQ(this->mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(this->mtx->get_num_stored_elements(), 192); + ASSERT_EQ(this->mtx->get_slice_size(), gko::matrix::default_slice_size); + ASSERT_EQ(this->mtx->get_stride_factor(), + gko::matrix::default_stride_factor); + ASSERT_EQ(this->mtx->get_total_cols(), 3); } -TEST_F(Sellp, ContainsCorrectData) { assert_equal_to_original_mtx(mtx.get()); } +TYPED_TEST(Sellp, ContainsCorrectData) +{ + this->assert_equal_to_original_mtx(this->mtx.get()); +} -TEST_F(Sellp, CanBeEmpty) +TYPED_TEST(Sellp, CanBeEmpty) { - auto mtx = Mtx::create(exec); + using Mtx = typename TestFixture::Mtx; + auto mtx = Mtx::create(this->exec); - assert_empty(mtx.get()); + this->assert_empty(mtx.get()); } -TEST_F(Sellp, CanBeConstructedWithSliceSizeAndStrideFactor) + +TYPED_TEST(Sellp, CanBeConstructedWithSliceSizeAndStrideFactor) { - auto mtx = Mtx::create(exec, gko::dim<2>{2, 3}, 2, 2, 3); + using Mtx = typename TestFixture::Mtx; + auto mtx = Mtx::create(this->exec, gko::dim<2>{2, 3}, 2, 2, 3); ASSERT_EQ(mtx->get_size(), gko::dim<2>(2, 3)); ASSERT_EQ(mtx->get_num_stored_elements(), 6); @@ -162,49 +180,53 @@ TEST_F(Sellp, CanBeConstructedWithSliceSizeAndStrideFactor) } -TEST_F(Sellp, 
CanBeCopied) +TYPED_TEST(Sellp, CanBeCopied) { - auto copy = Mtx::create(exec); + using Mtx = typename TestFixture::Mtx; + auto copy = Mtx::create(this->exec); - copy->copy_from(mtx.get()); + copy->copy_from(this->mtx.get()); - assert_equal_to_original_mtx(mtx.get()); - mtx->get_values()[1] = 5.0; - assert_equal_to_original_mtx(copy.get()); + this->assert_equal_to_original_mtx(this->mtx.get()); + this->mtx->get_values()[1] = 5.0; + this->assert_equal_to_original_mtx(copy.get()); } -TEST_F(Sellp, CanBeMoved) +TYPED_TEST(Sellp, CanBeMoved) { - auto copy = Mtx::create(exec); + using Mtx = typename TestFixture::Mtx; + auto copy = Mtx::create(this->exec); - copy->copy_from(std::move(mtx)); + copy->copy_from(std::move(this->mtx)); - assert_equal_to_original_mtx(copy.get()); + this->assert_equal_to_original_mtx(copy.get()); } -TEST_F(Sellp, CanBeCloned) +TYPED_TEST(Sellp, CanBeCloned) { - auto clone = mtx->clone(); + using Mtx = typename TestFixture::Mtx; + auto clone = this->mtx->clone(); - assert_equal_to_original_mtx(mtx.get()); - mtx->get_values()[1] = 5.0; - assert_equal_to_original_mtx(dynamic_cast(clone.get())); + this->assert_equal_to_original_mtx(this->mtx.get()); + this->mtx->get_values()[1] = 5.0; + this->assert_equal_to_original_mtx(dynamic_cast(clone.get())); } -TEST_F(Sellp, CanBeCleared) +TYPED_TEST(Sellp, CanBeCleared) { - mtx->clear(); + this->mtx->clear(); - assert_empty(mtx.get()); + this->assert_empty(this->mtx.get()); } -TEST_F(Sellp, CanBeReadFromMatrixData) +TYPED_TEST(Sellp, CanBeReadFromMatrixData) { - auto m = Mtx::create(exec); + using Mtx = typename TestFixture::Mtx; + auto m = Mtx::create(this->exec); m->read({{2, 3}, {{0, 0, 1.0}, {0, 1, 3.0}, @@ -213,12 +235,14 @@ TEST_F(Sellp, CanBeReadFromMatrixData) {1, 1, 5.0}, {1, 2, 0.0}}}); - assert_equal_to_original_mtx(m.get()); + this->assert_equal_to_original_mtx(m.get()); } -TEST_F(Sellp, CanBeReadFromMatrixDataWithSliceSizeAndStrideFactor) + +TYPED_TEST(Sellp, 
CanBeReadFromMatrixDataWithSliceSizeAndStrideFactor) { - auto m = Mtx::create(exec, gko::dim<2>{2, 3}, 2, 2, 3); + using Mtx = typename TestFixture::Mtx; + auto m = Mtx::create(this->exec, gko::dim<2>{2, 3}, 2, 2, 3); m->read({{2, 3}, {{0, 0, 1.0}, {0, 1, 3.0}, @@ -227,22 +251,26 @@ TEST_F(Sellp, CanBeReadFromMatrixDataWithSliceSizeAndStrideFactor) {1, 1, 5.0}, {1, 2, 0.0}}}); - assert_equal_to_original_mtx_with_slice_size_and_stride_factor(m.get()); + this->assert_equal_to_original_mtx_with_slice_size_and_stride_factor( + m.get()); } -TEST_F(Sellp, GeneratesCorrectMatrixData) + +TYPED_TEST(Sellp, GeneratesCorrectMatrixData) { - using tpl = gko::matrix_data<>::nonzero_type; - gko::matrix_data<> data; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using tpl = typename gko::matrix_data::nonzero_type; + gko::matrix_data data; - mtx->write(data); + this->mtx->write(data); ASSERT_EQ(data.size, gko::dim<2>(2, 3)); ASSERT_EQ(data.nonzeros.size(), 4); - EXPECT_EQ(data.nonzeros[0], tpl(0, 0, 1.0)); - EXPECT_EQ(data.nonzeros[1], tpl(0, 1, 3.0)); - EXPECT_EQ(data.nonzeros[2], tpl(0, 2, 2.0)); - EXPECT_EQ(data.nonzeros[3], tpl(1, 1, 5.0)); + EXPECT_EQ(data.nonzeros[0], tpl(0, 0, value_type{1.0})); + EXPECT_EQ(data.nonzeros[1], tpl(0, 1, value_type{3.0})); + EXPECT_EQ(data.nonzeros[2], tpl(0, 2, value_type{2.0})); + EXPECT_EQ(data.nonzeros[3], tpl(1, 1, value_type{5.0})); } diff --git a/core/test/matrix/sparsity_csr.cpp b/core/test/matrix/sparsity_csr.cpp index 6d11271e2d3..7e26fee9c88 100644 --- a/core/test/matrix/sparsity_csr.cpp +++ b/core/test/matrix/sparsity_csr.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -43,19 +43,28 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/test/utils.hpp" + + namespace { +template class SparsityCsr : public ::testing::Test { protected: - using Mtx = gko::matrix::SparsityCsr<>; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Mtx = gko::matrix::SparsityCsr; SparsityCsr() : exec(gko::ReferenceExecutor::create()), - mtx(gko::matrix::SparsityCsr<>::create(exec, gko::dim<2>{2, 3}, 4)) + mtx(gko::matrix::SparsityCsr::create( + exec, gko::dim<2>{2, 3}, 4)) { - Mtx::index_type *c = mtx->get_col_idxs(); - Mtx::index_type *r = mtx->get_row_ptrs(); + index_type *c = mtx->get_col_idxs(); + index_type *r = mtx->get_row_ptrs(); r[0] = 0; r[1] = 3; r[2] = 4; @@ -82,7 +91,7 @@ class SparsityCsr : public ::testing::Test { EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], 1.0); + EXPECT_EQ(v[0], value_type{1.0}); } void assert_empty(Mtx *m) @@ -90,106 +99,117 @@ class SparsityCsr : public ::testing::Test { ASSERT_EQ(m->get_size(), gko::dim<2>(0, 0)); ASSERT_EQ(m->get_num_nonzeros(), 0); ASSERT_EQ(m->get_const_col_idxs(), nullptr); - ASSERT_EQ(m->get_const_row_ptrs(), nullptr); - ASSERT_EQ(m->get_const_value(), nullptr); + ASSERT_NE(m->get_const_row_ptrs(), nullptr); + ASSERT_NE(m->get_const_value(), nullptr); ASSERT_EQ(m->get_col_idxs(), nullptr); - ASSERT_EQ(m->get_row_ptrs(), nullptr); - ASSERT_EQ(m->get_value(), nullptr); + ASSERT_NE(m->get_row_ptrs(), nullptr); + ASSERT_NE(m->get_value(), nullptr); } }; +TYPED_TEST_CASE(SparsityCsr, gko::test::ValueIndexTypes); + -TEST_F(SparsityCsr, KnowsItsSize) +TYPED_TEST(SparsityCsr, KnowsItsSize) { - ASSERT_EQ(mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(mtx->get_num_nonzeros(), 4); + ASSERT_EQ(this->mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(this->mtx->get_num_nonzeros(), 4); } -TEST_F(SparsityCsr, ContainsCorrectData) +TYPED_TEST(SparsityCsr, ContainsCorrectData) { - 
assert_equal_to_original_mtx(mtx.get()); + this->assert_equal_to_original_mtx(this->mtx.get()); } -TEST_F(SparsityCsr, CanBeEmpty) +TYPED_TEST(SparsityCsr, CanBeEmpty) { - auto mtx = Mtx::create(exec); + using Mtx = typename TestFixture::Mtx; + auto mtx = Mtx::create(this->exec); - assert_empty(mtx.get()); + this->assert_empty(mtx.get()); } -TEST_F(SparsityCsr, SetsCorrectDefaultValue) +TYPED_TEST(SparsityCsr, SetsCorrectDefaultValue) { - auto mtx = gko::matrix::SparsityCsr<>::create( - exec, gko::dim<2>{3, 2}, static_cast(0)); + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto mtx = gko::matrix::SparsityCsr::create( + this->exec, gko::dim<2>{3, 2}, static_cast(0)); - ASSERT_EQ(mtx->get_const_value()[0], 1.0); - ASSERT_EQ(mtx->get_value()[0], 1.0); + ASSERT_EQ(mtx->get_const_value()[0], value_type{1.0}); + ASSERT_EQ(mtx->get_value()[0], value_type{1.0}); } -TEST_F(SparsityCsr, CanBeCreatedFromExistingData) +TYPED_TEST(SparsityCsr, CanBeCreatedFromExistingData) { - gko::int32 col_idxs[] = {0, 1, 1, 0}; - gko::int32 row_ptrs[] = {0, 2, 3, 4}; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + index_type col_idxs[] = {0, 1, 1, 0}; + index_type row_ptrs[] = {0, 2, 3, 4}; - auto mtx = gko::matrix::SparsityCsr<>::create( - exec, gko::dim<2>{3, 2}, - gko::Array::view(exec, 4, col_idxs), - gko::Array::view(exec, 4, row_ptrs), 2.0); + auto mtx = gko::matrix::SparsityCsr::create( + this->exec, gko::dim<2>{3, 2}, + gko::Array::view(this->exec, 4, col_idxs), + gko::Array::view(this->exec, 4, row_ptrs), 2.0); ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs); ASSERT_EQ(mtx->get_const_row_ptrs(), row_ptrs); - ASSERT_EQ(mtx->get_const_value()[0], 2.0); + ASSERT_EQ(mtx->get_const_value()[0], value_type{2.0}); ASSERT_EQ(mtx->get_col_idxs(), col_idxs); ASSERT_EQ(mtx->get_row_ptrs(), row_ptrs); - ASSERT_EQ(mtx->get_value()[0], 2.0); + 
ASSERT_EQ(mtx->get_value()[0], value_type{2.0}); } -TEST_F(SparsityCsr, CanBeCopied) +TYPED_TEST(SparsityCsr, CanBeCopied) { - auto copy = Mtx::create(exec); + using Mtx = typename TestFixture::Mtx; + auto copy = Mtx::create(this->exec); - copy->copy_from(mtx.get()); + copy->copy_from(this->mtx.get()); - assert_equal_to_original_mtx(mtx.get()); - assert_equal_to_original_mtx(copy.get()); + this->assert_equal_to_original_mtx(this->mtx.get()); + this->assert_equal_to_original_mtx(copy.get()); } -TEST_F(SparsityCsr, CanBeMoved) +TYPED_TEST(SparsityCsr, CanBeMoved) { - auto copy = Mtx::create(exec); + using Mtx = typename TestFixture::Mtx; + auto copy = Mtx::create(this->exec); - copy->copy_from(std::move(mtx)); + copy->copy_from(std::move(this->mtx)); - assert_equal_to_original_mtx(copy.get()); + this->assert_equal_to_original_mtx(copy.get()); } -TEST_F(SparsityCsr, CanBeCloned) +TYPED_TEST(SparsityCsr, CanBeCloned) { - auto clone = mtx->clone(); + using Mtx = typename TestFixture::Mtx; + auto clone = this->mtx->clone(); - assert_equal_to_original_mtx(mtx.get()); - assert_equal_to_original_mtx(dynamic_cast(clone.get())); + this->assert_equal_to_original_mtx(this->mtx.get()); + this->assert_equal_to_original_mtx(dynamic_cast(clone.get())); } -TEST_F(SparsityCsr, CanBeCleared) +TYPED_TEST(SparsityCsr, CanBeCleared) { - mtx->clear(); + this->mtx->clear(); - assert_empty(mtx.get()); + this->assert_empty(this->mtx.get()); } -TEST_F(SparsityCsr, CanBeReadFromMatrixData) +TYPED_TEST(SparsityCsr, CanBeReadFromMatrixData) { - auto m = Mtx::create(exec); + using Mtx = typename TestFixture::Mtx; + auto m = Mtx::create(this->exec); m->read({{2, 3}, {{0, 0, 1.0}, @@ -199,23 +219,25 @@ TEST_F(SparsityCsr, CanBeReadFromMatrixData) {1, 1, 5.0}, {1, 2, 0.0}}}); - assert_equal_to_original_mtx(m.get()); + this->assert_equal_to_original_mtx(m.get()); } -TEST_F(SparsityCsr, GeneratesCorrectMatrixData) +TYPED_TEST(SparsityCsr, GeneratesCorrectMatrixData) { - using tpl = 
gko::matrix_data<>::nonzero_type; - gko::matrix_data<> data; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using tpl = typename gko::matrix_data::nonzero_type; + gko::matrix_data data; - mtx->write(data); + this->mtx->write(data); ASSERT_EQ(data.size, gko::dim<2>(2, 3)); ASSERT_EQ(data.nonzeros.size(), 4); - EXPECT_EQ(data.nonzeros[0], tpl(0, 0, 1.0)); - EXPECT_EQ(data.nonzeros[1], tpl(0, 1, 1.0)); - EXPECT_EQ(data.nonzeros[2], tpl(0, 2, 1.0)); - EXPECT_EQ(data.nonzeros[3], tpl(1, 1, 1.0)); + EXPECT_EQ(data.nonzeros[0], tpl(0, 0, value_type{1.0})); + EXPECT_EQ(data.nonzeros[1], tpl(0, 1, value_type{1.0})); + EXPECT_EQ(data.nonzeros[2], tpl(0, 2, value_type{1.0})); + EXPECT_EQ(data.nonzeros[3], tpl(1, 1, value_type{1.0})); } diff --git a/core/test/preconditioner/CMakeLists.txt b/core/test/preconditioner/CMakeLists.txt index 82eec1105ca..efbeed1af2e 100644 --- a/core/test/preconditioner/CMakeLists.txt +++ b/core/test/preconditioner/CMakeLists.txt @@ -1,2 +1,3 @@ ginkgo_create_test(ilu) +ginkgo_create_test(isai) ginkgo_create_test(jacobi) diff --git a/core/test/preconditioner/ilu.cpp b/core/test/preconditioner/ilu.cpp index 0bfa2392833..2103e00d958 100644 --- a/core/test/preconditioner/ilu.cpp +++ b/core/test/preconditioner/ilu.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -43,15 +43,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include "core/test/utils/assertions.hpp" +#include "core/test/utils.hpp" namespace { +template class IluFactory : public ::testing::Test { protected: - using value_type = double; + using value_type = T; using l_solver_type = gko::solver::Bicgstab; using u_solver_type = gko::solver::Bicgstab; using ilu_prec_type = @@ -64,34 +65,41 @@ class IluFactory : public ::testing::Test { {} std::shared_ptr exec; - std::shared_ptr l_factory; - std::shared_ptr u_factory; + std::shared_ptr l_factory; + std::shared_ptr u_factory; }; +TYPED_TEST_CASE(IluFactory, gko::test::ValueTypes); -TEST_F(IluFactory, KnowsItsExecutor) + +TYPED_TEST(IluFactory, KnowsItsExecutor) { - auto ilu_factory = ilu_prec_type::build().on(exec); + using ilu_prec_type = typename TestFixture::ilu_prec_type; + auto ilu_factory = ilu_prec_type::build().on(this->exec); - ASSERT_EQ(ilu_factory->get_executor(), exec); + ASSERT_EQ(ilu_factory->get_executor(), this->exec); } -TEST_F(IluFactory, CanSetLSolverFactory) +TYPED_TEST(IluFactory, CanSetLSolverFactory) { - auto ilu_factory = - ilu_prec_type::build().with_l_solver_factory(l_factory).on(exec); + using ilu_prec_type = typename TestFixture::ilu_prec_type; + auto ilu_factory = ilu_prec_type::build() + .with_l_solver_factory(this->l_factory) + .on(this->exec); - ASSERT_EQ(ilu_factory->get_parameters().l_solver_factory, l_factory); + ASSERT_EQ(ilu_factory->get_parameters().l_solver_factory, this->l_factory); } -TEST_F(IluFactory, CanSetUSolverFactory) +TYPED_TEST(IluFactory, CanSetUSolverFactory) { - auto ilu_factory = - ilu_prec_type::build().with_u_solver_factory(u_factory).on(exec); + using ilu_prec_type = typename TestFixture::ilu_prec_type; + auto ilu_factory = ilu_prec_type::build() + .with_u_solver_factory(this->u_factory) + .on(this->exec); - ASSERT_EQ(ilu_factory->get_parameters().u_solver_factory, u_factory); + ASSERT_EQ(ilu_factory->get_parameters().u_solver_factory, this->u_factory); } diff --git a/core/test/preconditioner/isai.cpp 
b/core/test/preconditioner/isai.cpp new file mode 100644 index 00000000000..1a549bca874 --- /dev/null +++ b/core/test/preconditioner/isai.cpp @@ -0,0 +1,183 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +struct DummyOperator : public gko::EnableLinOp, + gko::EnableCreateMethod { + DummyOperator(std::shared_ptr exec, + gko::dim<2> size = {}) + : gko::EnableLinOp(exec, size) + {} + + void apply_impl(const LinOp *b, LinOp *x) const override {} + + void apply_impl(const LinOp *alpha, const LinOp *b, const LinOp *beta, + LinOp *x) const override + {} +}; + + +template +class IsaiFactory : public ::testing::Test { +protected: + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using LowerIsai = gko::preconditioner::LowerIsai; + using UpperIsai = gko::preconditioner::UpperIsai; + using Csr = gko::matrix::Csr; + + IsaiFactory() + : exec(gko::ReferenceExecutor::create()), + lower_isai_factory(LowerIsai::build().on(exec)), + upper_isai_factory(UpperIsai::build().on(exec)) + {} + + std::shared_ptr exec; + std::unique_ptr lower_isai_factory; + std::unique_ptr upper_isai_factory; +}; + +TYPED_TEST_CASE(IsaiFactory, gko::test::ValueIndexTypes); + + +TYPED_TEST(IsaiFactory, KnowsItsExecutor) +{ + ASSERT_EQ(this->lower_isai_factory->get_executor(), this->exec); + ASSERT_EQ(this->upper_isai_factory->get_executor(), this->exec); +} + + +TYPED_TEST(IsaiFactory, SetsSkipSortingCorrectly) +{ + using LowerIsai = typename TestFixture::LowerIsai; + using UpperIsai = typename TestFixture::UpperIsai; + + auto l_isai_factory = + LowerIsai::build().with_skip_sorting(true).on(this->exec); + auto u_isai_factory = + UpperIsai::build().with_skip_sorting(true).on(this->exec); + + ASSERT_EQ(l_isai_factory->get_parameters().skip_sorting, true); + ASSERT_EQ(u_isai_factory->get_parameters().skip_sorting, true); +} + + +TYPED_TEST(IsaiFactory, SetsDefaultSkipSortingCorrectly) +{ + 
ASSERT_EQ(this->lower_isai_factory->get_parameters().skip_sorting, false); + ASSERT_EQ(this->upper_isai_factory->get_parameters().skip_sorting, false); +} + + +TYPED_TEST(IsaiFactory, SetsSparsityPowerCorrectly) +{ + using LowerIsai = typename TestFixture::LowerIsai; + using UpperIsai = typename TestFixture::UpperIsai; + + auto l_isai_factory = + LowerIsai::build().with_sparsity_power(2).on(this->exec); + auto u_isai_factory = + UpperIsai::build().with_sparsity_power(2).on(this->exec); + + ASSERT_EQ(l_isai_factory->get_parameters().sparsity_power, 2); + ASSERT_EQ(u_isai_factory->get_parameters().sparsity_power, 2); +} + + +TYPED_TEST(IsaiFactory, SetsDefaultSparsityPowerCorrectly) +{ + ASSERT_EQ(this->lower_isai_factory->get_parameters().sparsity_power, 1); + ASSERT_EQ(this->upper_isai_factory->get_parameters().sparsity_power, 1); +} + + +TYPED_TEST(IsaiFactory, ThrowsWrongDimensionL) +{ + using Csr = typename TestFixture::Csr; + auto mtx = Csr::create(this->exec, gko::dim<2>{1, 2}, 1); + + ASSERT_THROW(this->lower_isai_factory->generate(gko::share(mtx)), + gko::DimensionMismatch); +} + + +TYPED_TEST(IsaiFactory, ThrowsWrongDimensionU) +{ + using Csr = typename TestFixture::Csr; + auto mtx = Csr::create(this->exec, gko::dim<2>{1, 2}, 1); + + ASSERT_THROW(this->upper_isai_factory->generate(gko::share(mtx)), + gko::DimensionMismatch); +} + + +TYPED_TEST(IsaiFactory, ThrowsNoConversionCsrL) +{ + using Csr = typename TestFixture::Csr; + auto mtx = DummyOperator::create(this->exec, gko::dim<2>{2, 2}); + + ASSERT_THROW(this->lower_isai_factory->generate(gko::share(mtx)), + gko::NotSupported); +} + + +TYPED_TEST(IsaiFactory, ThrowsNoConversionCsrU) +{ + using Csr = typename TestFixture::Csr; + auto mtx = DummyOperator::create(this->exec, gko::dim<2>{2, 2}); + + ASSERT_THROW(this->upper_isai_factory->generate(gko::share(mtx)), + gko::NotSupported); +} + + +} // namespace diff --git a/core/test/preconditioner/jacobi.cpp b/core/test/preconditioner/jacobi.cpp index 
d2c76f77bd4..0af14acbabf 100644 --- a/core/test/preconditioner/jacobi.cpp +++ b/core/test/preconditioner/jacobi.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -39,19 +39,28 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/test/utils.hpp" + + namespace { +template class JacobiFactory : public ::testing::Test { protected: - using Bj = gko::preconditioner::Jacobi<>; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Bj = gko::preconditioner::Jacobi; JacobiFactory() : exec(gko::ReferenceExecutor::create()), bj_factory(Bj::build().with_max_block_size(3u).on(exec)), block_pointers(exec, 2), block_precisions(exec, 2), - mtx(gko::matrix::Csr<>::create(exec, gko::dim<2>{5, 5}, 13)) + mtx(gko::matrix::Csr::create( + exec, gko::dim<2>{5, 5}, 13)) { block_pointers.get_data()[0] = 2; block_pointers.get_data()[1] = 3; @@ -60,31 +69,34 @@ class JacobiFactory : public ::testing::Test { } std::shared_ptr exec; - std::unique_ptr bj_factory; - gko::Array block_pointers; + std::unique_ptr bj_factory; + gko::Array block_pointers; gko::Array block_precisions; - std::shared_ptr> mtx; + std::shared_ptr> mtx; }; +TYPED_TEST_CASE(JacobiFactory, gko::test::ValueIndexTypes); -TEST_F(JacobiFactory, KnowsItsExecutor) + +TYPED_TEST(JacobiFactory, KnowsItsExecutor) { - ASSERT_EQ(bj_factory->get_executor(), exec); + ASSERT_EQ(this->bj_factory->get_executor(), this->exec); } -TEST_F(JacobiFactory, SavesMaximumBlockSize) +TYPED_TEST(JacobiFactory, SavesMaximumBlockSize) { - ASSERT_EQ(bj_factory->get_parameters().max_block_size, 3); + ASSERT_EQ(this->bj_factory->get_parameters().max_block_size, 3); } 
-TEST_F(JacobiFactory, CanSetBlockPointers) +TYPED_TEST(JacobiFactory, CanSetBlockPointers) { + using Bj = typename TestFixture::Bj; auto bj_factory = Bj::build() .with_max_block_size(3u) - .with_block_pointers(block_pointers) - .on(exec); + .with_block_pointers(this->block_pointers) + .on(this->exec); auto ptrs = bj_factory->get_parameters().block_pointers; EXPECT_EQ(ptrs.get_data()[0], 2); @@ -92,12 +104,13 @@ TEST_F(JacobiFactory, CanSetBlockPointers) } -TEST_F(JacobiFactory, CanMoveBlockPointers) +TYPED_TEST(JacobiFactory, CanMoveBlockPointers) { + using Bj = typename TestFixture::Bj; auto bj_factory = Bj::build() .with_max_block_size(3u) - .with_block_pointers(std::move(block_pointers)) - .on(exec); + .with_block_pointers(std::move(this->block_pointers)) + .on(this->exec); auto ptrs = bj_factory->get_parameters().block_pointers; EXPECT_EQ(ptrs.get_data()[0], 2); @@ -105,12 +118,13 @@ TEST_F(JacobiFactory, CanMoveBlockPointers) } -TEST_F(JacobiFactory, CanSetBlockPrecisions) +TYPED_TEST(JacobiFactory, CanSetBlockPrecisions) { + using Bj = typename TestFixture::Bj; auto bj_factory = Bj::build() .with_max_block_size(3u) - .with_storage_optimization(block_precisions) - .on(exec); + .with_storage_optimization(this->block_precisions) + .on(this->exec); auto prec = bj_factory->get_parameters().storage_optimization.block_wise; EXPECT_EQ(prec.get_data()[0], gko::precision_reduction(0, 1)); @@ -118,13 +132,14 @@ TEST_F(JacobiFactory, CanSetBlockPrecisions) } -TEST_F(JacobiFactory, CanMoveBlockPrecisions) +TYPED_TEST(JacobiFactory, CanMoveBlockPrecisions) { + using Bj = typename TestFixture::Bj; auto bj_factory = Bj::build() .with_max_block_size(3u) - .with_storage_optimization(std::move(block_precisions)) - .on(exec); + .with_storage_optimization(std::move(this->block_precisions)) + .on(this->exec); auto prec = bj_factory->get_parameters().storage_optimization.block_wise; EXPECT_EQ(prec.get_data()[0], gko::precision_reduction(0, 1)); @@ -132,41 +147,46 @@ 
TEST_F(JacobiFactory, CanMoveBlockPrecisions) } +template class BlockInterleavedStorageScheme : public ::testing::Test { protected: + using index_type = T; // groups of 4 blocks, offset of 3 within the group and 16 between groups - gko::preconditioner::block_interleaved_storage_scheme s{3, 16, + gko::preconditioner::block_interleaved_storage_scheme s{3, 16, 2}; }; +TYPED_TEST_CASE(BlockInterleavedStorageScheme, gko::test::IndexTypes); + -TEST_F(BlockInterleavedStorageScheme, ComputesStorageSpace) +TYPED_TEST(BlockInterleavedStorageScheme, ComputesStorageSpace) { - ASSERT_EQ(s.compute_storage_space(10), 16 * 3); // 3 groups of 16 elements + ASSERT_EQ(this->s.compute_storage_space(10), + 16 * 3); // 3 groups of 16 elements } -TEST_F(BlockInterleavedStorageScheme, ComputesGroupOffset) +TYPED_TEST(BlockInterleavedStorageScheme, ComputesGroupOffset) { - ASSERT_EQ(s.get_group_offset(17), 16 * 4); // 5th group + ASSERT_EQ(this->s.get_group_offset(17), 16 * 4); // 5th group } -TEST_F(BlockInterleavedStorageScheme, ComputesBlockOffset) +TYPED_TEST(BlockInterleavedStorageScheme, ComputesBlockOffset) { - ASSERT_EQ(s.get_block_offset(17), 1 * 3); // 2nd in group + ASSERT_EQ(this->s.get_block_offset(17), 1 * 3); // 2nd in group } -TEST_F(BlockInterleavedStorageScheme, ComputesGlobalBlockOffset) +TYPED_TEST(BlockInterleavedStorageScheme, ComputesGlobalBlockOffset) { - ASSERT_EQ(s.get_global_block_offset(17), 16 * 4 + 1 * 3); + ASSERT_EQ(this->s.get_global_block_offset(17), 16 * 4 + 1 * 3); } -TEST_F(BlockInterleavedStorageScheme, ComputesStride) +TYPED_TEST(BlockInterleavedStorageScheme, ComputesStride) { - ASSERT_EQ(s.get_stride(), 4 * 3); // 4 offsets of 3 + ASSERT_EQ(this->s.get_stride(), 4 * 3); // 4 offsets of 3 } diff --git a/core/test/solver/CMakeLists.txt b/core/test/solver/CMakeLists.txt index b8b6e6fe596..e017edd6bee 100644 --- a/core/test/solver/CMakeLists.txt +++ b/core/test/solver/CMakeLists.txt @@ -1,3 +1,4 @@ +ginkgo_create_test(bicg) ginkgo_create_test(bicgstab) 
ginkgo_create_test(cg) ginkgo_create_test(cgs) diff --git a/core/test/solver/bicg.cpp b/core/test/solver/bicg.cpp new file mode 100644 index 00000000000..0d8763cdc86 --- /dev/null +++ b/core/test/solver/bicg.cpp @@ -0,0 +1,303 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +template +class Bicg : public ::testing::Test { +protected: + using value_type = T; + using Mtx = gko::matrix::Dense; + using Solver = gko::solver::Bicg; + + Bicg() + : exec(gko::ReferenceExecutor::create()), + mtx(gko::initialize( + {{2, -1.0, 0.0}, {-1.0, 2, -1.0}, {0.0, -1.0, 2}}, exec)), + bicg_factory( + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(3u).on(exec), + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(gko::remove_complex{1e-6}) + .on(exec)) + .on(exec)), + solver(bicg_factory->generate(mtx)) + {} + + std::shared_ptr exec; + std::shared_ptr mtx; + std::unique_ptr bicg_factory; + std::unique_ptr solver; + + static void assert_same_matrices(const Mtx *m1, const Mtx *m2) + { + ASSERT_EQ(m1->get_size()[0], m2->get_size()[0]); + ASSERT_EQ(m1->get_size()[1], m2->get_size()[1]); + for (gko::size_type i = 0; i < m1->get_size()[0]; ++i) { + for (gko::size_type j = 0; j < m2->get_size()[1]; ++j) { + EXPECT_EQ(m1->at(i, j), m2->at(i, j)); + } + } + } +}; + +TYPED_TEST_CASE(Bicg, gko::test::ValueTypes); + + +TYPED_TEST(Bicg, BicgFactoryKnowsItsExecutor) +{ + ASSERT_EQ(this->bicg_factory->get_executor(), this->exec); +} + + +TYPED_TEST(Bicg, BicgFactoryCreatesCorrectSolver) +{ + using Solver = typename TestFixture::Solver; + + ASSERT_EQ(this->solver->get_size(), gko::dim<2>(3, 3)); + auto bicg_solver = static_cast(this->solver.get()); + ASSERT_NE(bicg_solver->get_system_matrix(), nullptr); + ASSERT_EQ(bicg_solver->get_system_matrix(), this->mtx); +} + + +TYPED_TEST(Bicg, CanBeCopied) +{ + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto copy = this->bicg_factory->generate(Mtx::create(this->exec)); + + copy->copy_from(this->solver.get()); + + 
ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3)); + auto copy_mtx = static_cast(copy.get())->get_system_matrix(); + this->assert_same_matrices(static_cast(copy_mtx.get()), + this->mtx.get()); +} + + +TYPED_TEST(Bicg, CanBeMoved) +{ + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto copy = this->bicg_factory->generate(Mtx::create(this->exec)); + + copy->copy_from(std::move(this->solver)); + + ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3)); + auto copy_mtx = static_cast(copy.get())->get_system_matrix(); + this->assert_same_matrices(static_cast(copy_mtx.get()), + this->mtx.get()); +} + + +TYPED_TEST(Bicg, CanBeCloned) +{ + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto clone = this->solver->clone(); + + ASSERT_EQ(clone->get_size(), gko::dim<2>(3, 3)); + auto clone_mtx = static_cast(clone.get())->get_system_matrix(); + this->assert_same_matrices(static_cast(clone_mtx.get()), + this->mtx.get()); +} + + +TYPED_TEST(Bicg, CanBeCleared) +{ + using Solver = typename TestFixture::Solver; + this->solver->clear(); + + ASSERT_EQ(this->solver->get_size(), gko::dim<2>(0, 0)); + auto solver_mtx = + static_cast(this->solver.get())->get_system_matrix(); + ASSERT_EQ(solver_mtx, nullptr); +} + + +TYPED_TEST(Bicg, ApplyUsesInitialGuessReturnsTrue) +{ + using Solver = typename TestFixture::Solver; + ASSERT_TRUE(this->solver->apply_uses_initial_guess()); +} + + +TYPED_TEST(Bicg, CanSetPreconditionerGenerator) +{ + using Solver = typename TestFixture::Solver; + using value_type = typename TestFixture::value_type; + auto bicg_factory = + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec), + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor( + gko::remove_complex(1e-6)) + .on(this->exec)) + .with_preconditioner( + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(3u).on( + this->exec)) + .on(this->exec)) + 
.on(this->exec); + auto solver = bicg_factory->generate(this->mtx); + auto precond = dynamic_cast *>( + static_cast *>(solver.get()) + ->get_preconditioner() + .get()); + + ASSERT_NE(precond, nullptr); + ASSERT_EQ(precond->get_size(), gko::dim<2>(3, 3)); + ASSERT_EQ(precond->get_system_matrix(), this->mtx); +} + + +TYPED_TEST(Bicg, CanSetPreconditionerInFactory) +{ + using Solver = typename TestFixture::Solver; + std::shared_ptr bicg_precond = + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec) + ->generate(this->mtx); + + auto bicg_factory = + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .with_generated_preconditioner(bicg_precond) + .on(this->exec); + auto solver = bicg_factory->generate(this->mtx); + auto precond = solver->get_preconditioner(); + + ASSERT_NE(precond.get(), nullptr); + ASSERT_EQ(precond.get(), bicg_precond.get()); +} + + +TYPED_TEST(Bicg, CanSetCriteriaAgain) +{ + using Solver = typename TestFixture::Solver; + std::shared_ptr init_crit = + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec); + auto bicg_factory = Solver::build().with_criteria(init_crit).on(this->exec); + + ASSERT_EQ((bicg_factory->get_parameters().criteria).back(), init_crit); + + auto solver = bicg_factory->generate(this->mtx); + std::shared_ptr new_crit = + gko::stop::Iteration::build().with_max_iters(5u).on(this->exec); + + solver->set_stop_criterion_factory(new_crit); + auto new_crit_fac = solver->get_stop_criterion_factory(); + auto niter = + static_cast(new_crit_fac.get()) + ->get_parameters() + .max_iters; + + ASSERT_EQ(niter, 5); +} + + +TYPED_TEST(Bicg, ThrowsOnWrongPreconditionerInFactory) +{ + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + std::shared_ptr wrong_sized_mtx = + Mtx::create(this->exec, gko::dim<2>{1, 3}); + std::shared_ptr bicg_precond = + Solver::build() + .with_criteria( + 
gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec) + ->generate(wrong_sized_mtx); + + auto bicg_factory = + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .with_generated_preconditioner(bicg_precond) + .on(this->exec); + + ASSERT_THROW(bicg_factory->generate(this->mtx), gko::DimensionMismatch); +} + + +TYPED_TEST(Bicg, CanSetPreconditioner) +{ + using Solver = typename TestFixture::Solver; + std::shared_ptr bicg_precond = + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec) + ->generate(this->mtx); + + auto bicg_factory = + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec); + auto solver = bicg_factory->generate(this->mtx); + solver->set_preconditioner(bicg_precond); + auto precond = solver->get_preconditioner(); + + ASSERT_NE(precond.get(), nullptr); + ASSERT_EQ(precond.get(), bicg_precond.get()); +} + + +} // namespace diff --git a/core/test/solver/bicgstab.cpp b/core/test/solver/bicgstab.cpp index a582c1e5264..16d5b8a9bff 100644 --- a/core/test/solver/bicgstab.cpp +++ b/core/test/solver/bicgstab.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -40,17 +40,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include -#include +#include #include +#include "core/test/utils.hpp" + + namespace { +template class Bicgstab : public ::testing::Test { protected: - using Mtx = gko::matrix::Dense<>; - using Solver = gko::solver::Bicgstab<>; + using value_type = T; + using Mtx = gko::matrix::Dense; + using Solver = gko::solver::Bicgstab; Bicgstab() : exec(gko::ReferenceExecutor::create()), @@ -60,8 +65,8 @@ class Bicgstab : public ::testing::Test { Solver::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(3u).on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-6) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(gko::remove_complex{1e-6}) .on(exec)) .on(exec)), solver(bicgstab_factory->generate(mtx)) @@ -69,7 +74,7 @@ class Bicgstab : public ::testing::Test { std::shared_ptr exec; std::shared_ptr mtx; - std::unique_ptr bicgstab_factory; + std::unique_ptr bicgstab_factory; std::unique_ptr solver; static void assert_same_matrices(const Mtx *m1, const Mtx *m2) @@ -84,101 +89,154 @@ class Bicgstab : public ::testing::Test { } }; +TYPED_TEST_CASE(Bicgstab, gko::test::ValueTypes); + -TEST_F(Bicgstab, BicgstabFactoryKnowsItsExecutor) +TYPED_TEST(Bicgstab, BicgstabFactoryKnowsItsExecutor) { - ASSERT_EQ(bicgstab_factory->get_executor(), exec); + ASSERT_EQ(this->bicgstab_factory->get_executor(), this->exec); } -TEST_F(Bicgstab, BicgstabFactoryCreatesCorrectSolver) +TYPED_TEST(Bicgstab, BicgstabFactoryCreatesCorrectSolver) { - ASSERT_EQ(solver->get_size(), gko::dim<2>(3, 3)); - auto bicgstab_solver = static_cast(solver.get()); + using Solver = typename TestFixture::Solver; + ASSERT_EQ(this->solver->get_size(), gko::dim<2>(3, 3)); + auto bicgstab_solver = static_cast(this->solver.get()); ASSERT_NE(bicgstab_solver->get_system_matrix(), nullptr); - ASSERT_EQ(bicgstab_solver->get_system_matrix(), mtx); + ASSERT_EQ(bicgstab_solver->get_system_matrix(), this->mtx); } -TEST_F(Bicgstab, CanBeCopied) 
+TYPED_TEST(Bicgstab, CanBeCopied) { - auto copy = bicgstab_factory->generate(Mtx::create(exec)); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto copy = this->bicgstab_factory->generate(Mtx::create(this->exec)); - copy->copy_from(solver.get()); + copy->copy_from(this->solver.get()); ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3)); auto copy_mtx = static_cast(copy.get())->get_system_matrix(); - assert_same_matrices(static_cast(copy_mtx.get()), mtx.get()); + this->assert_same_matrices(static_cast(copy_mtx.get()), + this->mtx.get()); } -TEST_F(Bicgstab, CanBeMoved) +TYPED_TEST(Bicgstab, CanBeMoved) { - auto copy = bicgstab_factory->generate(Mtx::create(exec)); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto copy = this->bicgstab_factory->generate(Mtx::create(this->exec)); - copy->copy_from(std::move(solver)); + copy->copy_from(std::move(this->solver)); ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3)); auto copy_mtx = static_cast(copy.get())->get_system_matrix(); - assert_same_matrices(static_cast(copy_mtx.get()), mtx.get()); + this->assert_same_matrices(static_cast(copy_mtx.get()), + this->mtx.get()); } -TEST_F(Bicgstab, CanBeCloned) +TYPED_TEST(Bicgstab, CanBeCloned) { - auto clone = solver->clone(); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto clone = this->solver->clone(); ASSERT_EQ(clone->get_size(), gko::dim<2>(3, 3)); auto clone_mtx = static_cast(clone.get())->get_system_matrix(); - assert_same_matrices(static_cast(clone_mtx.get()), mtx.get()); + this->assert_same_matrices(static_cast(clone_mtx.get()), + this->mtx.get()); } -TEST_F(Bicgstab, CanBeCleared) +TYPED_TEST(Bicgstab, CanBeCleared) { - solver->clear(); + using Solver = typename TestFixture::Solver; + this->solver->clear(); - ASSERT_EQ(solver->get_size(), gko::dim<2>(0, 0)); - auto solver_mtx = static_cast(solver.get())->get_system_matrix(); + 
ASSERT_EQ(this->solver->get_size(), gko::dim<2>(0, 0)); + auto solver_mtx = + static_cast(this->solver.get())->get_system_matrix(); ASSERT_EQ(solver_mtx, nullptr); } -TEST_F(Bicgstab, CanSetPreconditionerGenerator) +TYPED_TEST(Bicgstab, ApplyUsesInitialGuessReturnsTrue) { + ASSERT_TRUE(this->solver->apply_uses_initial_guess()); +} + + +TYPED_TEST(Bicgstab, CanSetPreconditionerGenerator) +{ + using Solver = typename TestFixture::Solver; + using value_type = typename TestFixture::value_type; auto bicgstab_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .with_preconditioner(Solver::build().on(exec)) - .on(exec); - - auto solver = bicgstab_factory->generate(mtx); - auto precond = dynamic_cast *>( + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .with_preconditioner( + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(3u).on( + this->exec)) + .on(this->exec)) + .on(this->exec); + + auto solver = bicgstab_factory->generate(this->mtx); + auto precond = dynamic_cast *>( gko::lend(solver->get_preconditioner())); ASSERT_NE(precond, nullptr); ASSERT_EQ(precond->get_size(), gko::dim<2>(3, 3)); - ASSERT_EQ(precond->get_system_matrix(), mtx); + ASSERT_EQ(precond->get_system_matrix(), this->mtx); +} + + +TYPED_TEST(Bicgstab, CanSetCriteriaAgain) +{ + using Solver = typename TestFixture::Solver; + std::shared_ptr init_crit = + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec); + auto bicgstab_factory = + Solver::build().with_criteria(init_crit).on(this->exec); + + ASSERT_EQ((bicgstab_factory->get_parameters().criteria).back(), init_crit); + + auto solver = bicgstab_factory->generate(this->mtx); + std::shared_ptr new_crit = + gko::stop::Iteration::build().with_max_iters(5u).on(this->exec); + + solver->set_stop_criterion_factory(new_crit); + auto new_crit_fac = solver->get_stop_criterion_factory(); + auto niter = + static_cast(new_crit_fac.get()) + 
->get_parameters() + .max_iters; + + ASSERT_EQ(niter, 5); } -TEST_F(Bicgstab, CanSetPreconditionerInFactory) +TYPED_TEST(Bicgstab, CanSetPreconditionerInFactory) { + using Solver = typename TestFixture::Solver; std::shared_ptr bicgstab_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec) - ->generate(mtx); + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec) + ->generate(this->mtx); auto bicgstab_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) .with_generated_preconditioner(bicgstab_precond) - .on(exec); - auto solver = bicgstab_factory->generate(mtx); + .on(this->exec); + auto solver = bicgstab_factory->generate(this->mtx); auto precond = solver->get_preconditioner(); ASSERT_NE(precond.get(), nullptr); @@ -186,42 +244,46 @@ TEST_F(Bicgstab, CanSetPreconditionerInFactory) } -TEST_F(Bicgstab, ThrowsOnWrongPreconditionerInFactory) +TYPED_TEST(Bicgstab, ThrowsOnWrongPreconditionerInFactory) { - std::shared_ptr wrong_sized_mtx = Mtx::create(exec, gko::dim<2>{1, 3}); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + std::shared_ptr wrong_sized_mtx = + Mtx::create(this->exec, gko::dim<2>{1, 3}); std::shared_ptr bicgstab_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec) ->generate(wrong_sized_mtx); auto bicgstab_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) .with_generated_preconditioner(bicgstab_precond) - .on(exec); + .on(this->exec); - ASSERT_THROW(bicgstab_factory->generate(mtx), gko::DimensionMismatch); + 
ASSERT_THROW(bicgstab_factory->generate(this->mtx), gko::DimensionMismatch); } -TEST_F(Bicgstab, CanSetPreconditioner) +TYPED_TEST(Bicgstab, CanSetPreconditioner) { + using Solver = typename TestFixture::Solver; std::shared_ptr bicgstab_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec) - ->generate(mtx); + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec) + ->generate(this->mtx); auto bicgstab_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec); - auto solver = bicgstab_factory->generate(mtx); + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec); + auto solver = bicgstab_factory->generate(this->mtx); solver->set_preconditioner(bicgstab_precond); auto precond = solver->get_preconditioner(); diff --git a/core/test/solver/cg.cpp b/core/test/solver/cg.cpp index 9461ba0fc5a..e6652defb0e 100644 --- a/core/test/solver/cg.cpp +++ b/core/test/solver/cg.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -43,16 +43,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include -#include +#include + + +#include "core/test/utils.hpp" namespace { +template class Cg : public ::testing::Test { protected: - using Mtx = gko::matrix::Dense<>; - using Solver = gko::solver::Cg<>; + using value_type = T; + using Mtx = gko::matrix::Dense; + using Solver = gko::solver::Cg; Cg() : exec(gko::ReferenceExecutor::create()), @@ -62,8 +67,8 @@ class Cg : public ::testing::Test { Solver::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(3u).on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-6) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(gko::remove_complex{1e-6}) .on(exec)) .on(exec)), solver(cg_factory->generate(mtx)) @@ -71,7 +76,7 @@ class Cg : public ::testing::Test { std::shared_ptr exec; std::shared_ptr mtx; - std::unique_ptr cg_factory; + std::unique_ptr cg_factory; std::unique_ptr solver; static void assert_same_matrices(const Mtx *m1, const Mtx *m2) @@ -86,105 +91,135 @@ class Cg : public ::testing::Test { } }; +TYPED_TEST_CASE(Cg, gko::test::ValueTypes); -TEST_F(Cg, CgFactoryKnowsItsExecutor) + +TYPED_TEST(Cg, CgFactoryKnowsItsExecutor) { - ASSERT_EQ(cg_factory->get_executor(), exec); + ASSERT_EQ(this->cg_factory->get_executor(), this->exec); } -TEST_F(Cg, CgFactoryCreatesCorrectSolver) +TYPED_TEST(Cg, CgFactoryCreatesCorrectSolver) { - ASSERT_EQ(solver->get_size(), gko::dim<2>(3, 3)); - auto cg_solver = static_cast(solver.get()); + using Solver = typename TestFixture::Solver; + + ASSERT_EQ(this->solver->get_size(), gko::dim<2>(3, 3)); + auto cg_solver = static_cast(this->solver.get()); ASSERT_NE(cg_solver->get_system_matrix(), nullptr); - ASSERT_EQ(cg_solver->get_system_matrix(), mtx); + ASSERT_EQ(cg_solver->get_system_matrix(), this->mtx); } -TEST_F(Cg, CanBeCopied) +TYPED_TEST(Cg, CanBeCopied) { - auto copy = cg_factory->generate(Mtx::create(exec)); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto 
copy = this->cg_factory->generate(Mtx::create(this->exec)); - copy->copy_from(solver.get()); + copy->copy_from(this->solver.get()); ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3)); auto copy_mtx = static_cast(copy.get())->get_system_matrix(); - assert_same_matrices(static_cast(copy_mtx.get()), mtx.get()); + this->assert_same_matrices(static_cast(copy_mtx.get()), + this->mtx.get()); } -TEST_F(Cg, CanBeMoved) +TYPED_TEST(Cg, CanBeMoved) { - auto copy = cg_factory->generate(Mtx::create(exec)); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto copy = this->cg_factory->generate(Mtx::create(this->exec)); - copy->copy_from(std::move(solver)); + copy->copy_from(std::move(this->solver)); ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3)); auto copy_mtx = static_cast(copy.get())->get_system_matrix(); - assert_same_matrices(static_cast(copy_mtx.get()), mtx.get()); + this->assert_same_matrices(static_cast(copy_mtx.get()), + this->mtx.get()); } -TEST_F(Cg, CanBeCloned) +TYPED_TEST(Cg, CanBeCloned) { - auto clone = solver->clone(); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto clone = this->solver->clone(); ASSERT_EQ(clone->get_size(), gko::dim<2>(3, 3)); auto clone_mtx = static_cast(clone.get())->get_system_matrix(); - assert_same_matrices(static_cast(clone_mtx.get()), mtx.get()); + this->assert_same_matrices(static_cast(clone_mtx.get()), + this->mtx.get()); } -TEST_F(Cg, CanBeCleared) +TYPED_TEST(Cg, CanBeCleared) { - solver->clear(); + using Solver = typename TestFixture::Solver; + this->solver->clear(); - ASSERT_EQ(solver->get_size(), gko::dim<2>(0, 0)); - auto solver_mtx = static_cast(solver.get())->get_system_matrix(); + ASSERT_EQ(this->solver->get_size(), gko::dim<2>(0, 0)); + auto solver_mtx = + static_cast(this->solver.get())->get_system_matrix(); ASSERT_EQ(solver_mtx, nullptr); } -TEST_F(Cg, CanSetPreconditionerGenerator) +TYPED_TEST(Cg, ApplyUsesInitialGuessReturnsTrue) +{ + 
ASSERT_TRUE(this->solver->apply_uses_initial_guess()); +} + + +TYPED_TEST(Cg, CanSetPreconditionerGenerator) { + using Solver = typename TestFixture::Solver; + using value_type = typename TestFixture::value_type; auto cg_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-6) - .on(exec)) - .with_preconditioner(Solver::build().on(exec)) - .on(exec); - auto solver = cg_factory->generate(mtx); - auto precond = dynamic_cast *>( - static_cast *>(solver.get()) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec), + gko::stop::ResidualNormReduction::build() + .with_reduction_factor( + gko::remove_complex(1e-6)) + .on(this->exec)) + .with_preconditioner( + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(3u).on( + this->exec)) + .on(this->exec)) + .on(this->exec); + auto solver = cg_factory->generate(this->mtx); + auto precond = dynamic_cast *>( + static_cast *>(solver.get()) ->get_preconditioner() .get()); ASSERT_NE(precond, nullptr); ASSERT_EQ(precond->get_size(), gko::dim<2>(3, 3)); - ASSERT_EQ(precond->get_system_matrix(), mtx); + ASSERT_EQ(precond->get_system_matrix(), this->mtx); } -TEST_F(Cg, CanSetPreconditionerInFactory) +TYPED_TEST(Cg, CanSetPreconditionerInFactory) { + using Solver = typename TestFixture::Solver; std::shared_ptr cg_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec) - ->generate(mtx); + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec) + ->generate(this->mtx); auto cg_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) .with_generated_preconditioner(cg_precond) - .on(exec); - auto solver = cg_factory->generate(mtx); + .on(this->exec); + auto solver = 
cg_factory->generate(this->mtx); auto precond = solver->get_preconditioner(); ASSERT_NE(precond.get(), nullptr); @@ -192,42 +227,70 @@ TEST_F(Cg, CanSetPreconditionerInFactory) } -TEST_F(Cg, ThrowsOnWrongPreconditionerInFactory) +TYPED_TEST(Cg, CanSetCriteriaAgain) +{ + using Solver = typename TestFixture::Solver; + std::shared_ptr init_crit = + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec); + auto cg_factory = Solver::build().with_criteria(init_crit).on(this->exec); + + ASSERT_EQ((cg_factory->get_parameters().criteria).back(), init_crit); + + auto solver = cg_factory->generate(this->mtx); + std::shared_ptr new_crit = + gko::stop::Iteration::build().with_max_iters(5u).on(this->exec); + + solver->set_stop_criterion_factory(new_crit); + auto new_crit_fac = solver->get_stop_criterion_factory(); + auto niter = + static_cast(new_crit_fac.get()) + ->get_parameters() + .max_iters; + + ASSERT_EQ(niter, 5); +} + + +TYPED_TEST(Cg, ThrowsOnWrongPreconditionerInFactory) { - std::shared_ptr wrong_sized_mtx = Mtx::create(exec, gko::dim<2>{1, 3}); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + std::shared_ptr wrong_sized_mtx = + Mtx::create(this->exec, gko::dim<2>{1, 3}); std::shared_ptr cg_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec) ->generate(wrong_sized_mtx); auto cg_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) .with_generated_preconditioner(cg_precond) - .on(exec); + .on(this->exec); - ASSERT_THROW(cg_factory->generate(mtx), gko::DimensionMismatch); + ASSERT_THROW(cg_factory->generate(this->mtx), gko::DimensionMismatch); } -TEST_F(Cg, CanSetPreconditioner) +TYPED_TEST(Cg, CanSetPreconditioner) { + using Solver = typename 
TestFixture::Solver; std::shared_ptr cg_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec) - ->generate(mtx); + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec) + ->generate(this->mtx); auto cg_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec); - auto solver = cg_factory->generate(mtx); + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec); + auto solver = cg_factory->generate(this->mtx); solver->set_preconditioner(cg_precond); auto precond = solver->get_preconditioner(); diff --git a/core/test/solver/cgs.cpp b/core/test/solver/cgs.cpp index cc65423ddf8..04f7c31aab9 100644 --- a/core/test/solver/cgs.cpp +++ b/core/test/solver/cgs.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -43,16 +43,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include -#include +#include + + +#include "core/test/utils.hpp" namespace { +template class Cgs : public ::testing::Test { protected: - using Mtx = gko::matrix::Dense<>; - using Solver = gko::solver::Cgs<>; + using value_type = T; + using Mtx = gko::matrix::Dense; + using Solver = gko::solver::Cgs; Cgs() : exec(gko::ReferenceExecutor::create()), @@ -62,8 +67,8 @@ class Cgs : public ::testing::Test { Solver::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(3u).on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-6) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(gko::remove_complex{1e-6}) .on(exec)) .on(exec)), solver(cgs_factory->generate(mtx)) @@ -71,7 +76,7 @@ class Cgs : public ::testing::Test { std::shared_ptr exec; std::shared_ptr mtx; - std::unique_ptr cgs_factory; + std::unique_ptr cgs_factory; std::unique_ptr solver; static void assert_same_matrices(const Mtx *m1, const Mtx *m2) @@ -86,105 +91,159 @@ class Cgs : public ::testing::Test { } }; +TYPED_TEST_CASE(Cgs, gko::test::ValueTypes); -TEST_F(Cgs, CgsFactoryKnowsItsExecutor) + +TYPED_TEST(Cgs, CgsFactoryKnowsItsExecutor) { - ASSERT_EQ(cgs_factory->get_executor(), exec); + ASSERT_EQ(this->cgs_factory->get_executor(), this->exec); } -TEST_F(Cgs, CgsFactoryCreatesCorrectSolver) +TYPED_TEST(Cgs, CgsFactoryCreatesCorrectSolver) { - ASSERT_EQ(solver->get_size(), gko::dim<2>(3, 3)); - auto cgs_solver = static_cast(solver.get()); + using Solver = typename TestFixture::Solver; + ASSERT_EQ(this->solver->get_size(), gko::dim<2>(3, 3)); + auto cgs_solver = static_cast(this->solver.get()); ASSERT_NE(cgs_solver->get_system_matrix(), nullptr); - ASSERT_EQ(cgs_solver->get_system_matrix(), mtx); + ASSERT_EQ(cgs_solver->get_system_matrix(), this->mtx); } -TEST_F(Cgs, CanBeCopied) +TYPED_TEST(Cgs, CanBeCopied) { - auto copy = cgs_factory->generate(Mtx::create(exec)); + using Mtx = typename TestFixture::Mtx; + using Solver = typename 
TestFixture::Solver; + auto copy = this->cgs_factory->generate(Mtx::create(this->exec)); - copy->copy_from(solver.get()); + copy->copy_from(this->solver.get()); ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3)); auto copy_mtx = static_cast(copy.get())->get_system_matrix(); - assert_same_matrices(static_cast(copy_mtx.get()), mtx.get()); + this->assert_same_matrices(static_cast(copy_mtx.get()), + this->mtx.get()); } -TEST_F(Cgs, CanBeMoved) +TYPED_TEST(Cgs, CanBeMoved) { - auto copy = cgs_factory->generate(Mtx::create(exec)); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto copy = this->cgs_factory->generate(Mtx::create(this->exec)); - copy->copy_from(std::move(solver)); + copy->copy_from(std::move(this->solver)); ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3)); auto copy_mtx = static_cast(copy.get())->get_system_matrix(); - assert_same_matrices(static_cast(copy_mtx.get()), mtx.get()); + this->assert_same_matrices(static_cast(copy_mtx.get()), + this->mtx.get()); } -TEST_F(Cgs, CanBeCloned) +TYPED_TEST(Cgs, CanBeCloned) { - auto clone = solver->clone(); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto clone = this->solver->clone(); ASSERT_EQ(clone->get_size(), gko::dim<2>(3, 3)); auto clone_mtx = static_cast(clone.get())->get_system_matrix(); - assert_same_matrices(static_cast(clone_mtx.get()), mtx.get()); + this->assert_same_matrices(static_cast(clone_mtx.get()), + this->mtx.get()); } -TEST_F(Cgs, CanBeCleared) +TYPED_TEST(Cgs, CanBeCleared) { - solver->clear(); + using Solver = typename TestFixture::Solver; + this->solver->clear(); - ASSERT_EQ(solver->get_size(), gko::dim<2>(0, 0)); - auto solver_mtx = static_cast(solver.get())->get_system_matrix(); + ASSERT_EQ(this->solver->get_size(), gko::dim<2>(0, 0)); + auto solver_mtx = + static_cast(this->solver.get())->get_system_matrix(); ASSERT_EQ(solver_mtx, nullptr); } -TEST_F(Cgs, CanSetPreconditionerGenerator) +TYPED_TEST(Cgs, 
ApplyUsesInitialGuessReturnsTrue) +{ + ASSERT_TRUE(this->solver->apply_uses_initial_guess()); +} + + +TYPED_TEST(Cgs, CanSetPreconditionerGenerator) { + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + using value_type = typename TestFixture::value_type; auto cgs_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-6) - .on(exec)) - .with_preconditioner(Solver::build().on(exec)) - .on(exec); - auto solver = cgs_factory->generate(mtx); - auto precond = dynamic_cast *>( - static_cast *>(solver.get()) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec), + gko::stop::ResidualNormReduction::build() + .with_reduction_factor( + gko::remove_complex(1e-6)) + .on(this->exec)) + .with_preconditioner( + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(3u).on( + this->exec)) + .on(this->exec)) + .on(this->exec); + auto solver = cgs_factory->generate(this->mtx); + auto precond = dynamic_cast *>( + static_cast *>(solver.get()) ->get_preconditioner() .get()); ASSERT_NE(precond, nullptr); ASSERT_EQ(precond->get_size(), gko::dim<2>(3, 3)); - ASSERT_EQ(precond->get_system_matrix(), mtx); + ASSERT_EQ(precond->get_system_matrix(), this->mtx); +} + + +TYPED_TEST(Cgs, CanSetCriteriaAgain) +{ + using Solver = typename TestFixture::Solver; + std::shared_ptr init_crit = + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec); + auto cgs_factory = Solver::build().with_criteria(init_crit).on(this->exec); + + ASSERT_EQ((cgs_factory->get_parameters().criteria).back(), init_crit); + + auto solver = cgs_factory->generate(this->mtx); + std::shared_ptr new_crit = + gko::stop::Iteration::build().with_max_iters(5u).on(this->exec); + + solver->set_stop_criterion_factory(new_crit); + auto new_crit_fac = solver->get_stop_criterion_factory(); + auto niter = + 
static_cast(new_crit_fac.get()) + ->get_parameters() + .max_iters; + + ASSERT_EQ(niter, 5); } -TEST_F(Cgs, CanSetPreconditionerInFactory) +TYPED_TEST(Cgs, CanSetPreconditionerInFactory) { + using Solver = typename TestFixture::Solver; std::shared_ptr cgs_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec) - ->generate(mtx); + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec) + ->generate(this->mtx); auto cgs_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) .with_generated_preconditioner(cgs_precond) - .on(exec); - auto solver = cgs_factory->generate(mtx); + .on(this->exec); + auto solver = cgs_factory->generate(this->mtx); auto precond = solver->get_preconditioner(); ASSERT_NE(precond.get(), nullptr); @@ -192,42 +251,46 @@ TEST_F(Cgs, CanSetPreconditionerInFactory) } -TEST_F(Cgs, ThrowsOnWrongPreconditionerInFactory) +TYPED_TEST(Cgs, ThrowsOnWrongPreconditionerInFactory) { - std::shared_ptr wrong_sized_mtx = Mtx::create(exec, gko::dim<2>{1, 3}); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + std::shared_ptr wrong_sized_mtx = + Mtx::create(this->exec, gko::dim<2>{1, 3}); std::shared_ptr cgs_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec) ->generate(wrong_sized_mtx); auto cgs_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) .with_generated_preconditioner(cgs_precond) - .on(exec); + .on(this->exec); - ASSERT_THROW(cgs_factory->generate(mtx), gko::DimensionMismatch); + ASSERT_THROW(cgs_factory->generate(this->mtx), gko::DimensionMismatch); } 
-TEST_F(Cgs, CanSetPreconditioner) +TYPED_TEST(Cgs, CanSetPreconditioner) { + using Solver = typename TestFixture::Solver; std::shared_ptr cgs_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec) - ->generate(mtx); + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec) + ->generate(this->mtx); auto cgs_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec); - auto solver = cgs_factory->generate(mtx); + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec); + auto solver = cgs_factory->generate(this->mtx); solver->set_preconditioner(cgs_precond); auto precond = solver->get_preconditioner(); diff --git a/core/test/solver/fcg.cpp b/core/test/solver/fcg.cpp index bf4618b1a89..6b9c0e954a7 100644 --- a/core/test/solver/fcg.cpp +++ b/core/test/solver/fcg.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -36,21 +36,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include #include #include #include #include -#include +#include + + +#include "core/test/utils.hpp" namespace { +template class Fcg : public ::testing::Test { protected: - using Mtx = gko::matrix::Dense<>; - using Solver = gko::solver::Fcg<>; + using value_type = T; + using Mtx = gko::matrix::Dense; + using Solver = gko::solver::Fcg; Fcg() : exec(gko::ReferenceExecutor::create()), @@ -60,8 +64,8 @@ class Fcg : public ::testing::Test { Solver::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(3u).on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-6) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(gko::remove_complex{1e-6}) .on(exec)) .on(exec)), solver(fcg_factory->generate(mtx)) @@ -69,112 +73,162 @@ class Fcg : public ::testing::Test { std::shared_ptr exec; std::shared_ptr mtx; - std::unique_ptr fcg_factory; + std::unique_ptr fcg_factory; std::unique_ptr solver; }; +TYPED_TEST_CASE(Fcg, gko::test::ValueTypes); -TEST_F(Fcg, FcgFactoryKnowsItsExecutor) + +TYPED_TEST(Fcg, FcgFactoryKnowsItsExecutor) { - ASSERT_EQ(fcg_factory->get_executor(), exec); + ASSERT_EQ(this->fcg_factory->get_executor(), this->exec); } -TEST_F(Fcg, FcgFactoryCreatesCorrectSolver) +TYPED_TEST(Fcg, FcgFactoryCreatesCorrectSolver) { - ASSERT_EQ(solver->get_size(), gko::dim<2>(3, 3)); - auto fcg_solver = dynamic_cast(solver.get()); + using Solver = typename TestFixture::Solver; + ASSERT_EQ(this->solver->get_size(), gko::dim<2>(3, 3)); + auto fcg_solver = dynamic_cast(this->solver.get()); ASSERT_NE(fcg_solver->get_system_matrix(), nullptr); - ASSERT_EQ(fcg_solver->get_system_matrix(), mtx); + ASSERT_EQ(fcg_solver->get_system_matrix(), this->mtx); } -TEST_F(Fcg, CanBeCopied) +TYPED_TEST(Fcg, CanBeCopied) { - auto copy = fcg_factory->generate(Mtx::create(exec)); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto copy = this->fcg_factory->generate(Mtx::create(this->exec)); - 
copy->copy_from(solver.get()); + copy->copy_from(this->solver.get()); ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3)); auto copy_mtx = dynamic_cast(copy.get())->get_system_matrix(); - GKO_ASSERT_MTX_NEAR(dynamic_cast(copy_mtx.get()), mtx.get(), - 1e-14); + GKO_ASSERT_MTX_NEAR(dynamic_cast(copy_mtx.get()), + this->mtx.get(), 0.0); } -TEST_F(Fcg, CanBeMoved) +TYPED_TEST(Fcg, CanBeMoved) { - auto copy = fcg_factory->generate(Mtx::create(exec)); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto copy = this->fcg_factory->generate(Mtx::create(this->exec)); - copy->copy_from(std::move(solver)); + copy->copy_from(std::move(this->solver)); ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3)); auto copy_mtx = dynamic_cast(copy.get())->get_system_matrix(); - GKO_ASSERT_MTX_NEAR(dynamic_cast(copy_mtx.get()), mtx.get(), - 1e-14); + GKO_ASSERT_MTX_NEAR(dynamic_cast(copy_mtx.get()), + this->mtx.get(), 0.0); } -TEST_F(Fcg, CanBeCloned) +TYPED_TEST(Fcg, CanBeCloned) { - auto clone = solver->clone(); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto clone = this->solver->clone(); ASSERT_EQ(clone->get_size(), gko::dim<2>(3, 3)); auto clone_mtx = dynamic_cast(clone.get())->get_system_matrix(); - GKO_ASSERT_MTX_NEAR(dynamic_cast(clone_mtx.get()), mtx.get(), - 1e-14); + GKO_ASSERT_MTX_NEAR(dynamic_cast(clone_mtx.get()), + this->mtx.get(), 0.0); } -TEST_F(Fcg, CanBeCleared) +TYPED_TEST(Fcg, CanBeCleared) { - solver->clear(); + using Solver = typename TestFixture::Solver; + this->solver->clear(); - ASSERT_EQ(solver->get_size(), gko::dim<2>(0, 0)); - auto solver_mtx = static_cast(solver.get())->get_system_matrix(); + ASSERT_EQ(this->solver->get_size(), gko::dim<2>(0, 0)); + auto solver_mtx = + static_cast(this->solver.get())->get_system_matrix(); ASSERT_EQ(solver_mtx, nullptr); } -TEST_F(Fcg, CanSetPreconditionerGenerator) +TYPED_TEST(Fcg, ApplyUsesInitialGuessReturnsTrue) +{ + 
ASSERT_TRUE(this->solver->apply_uses_initial_guess()); +} + + +TYPED_TEST(Fcg, CanSetPreconditionerGenerator) { + using Solver = typename TestFixture::Solver; + using value_type = typename TestFixture::value_type; auto fcg_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-6) - .on(exec)) - .with_preconditioner(Solver::build().on(exec)) - .on(exec); - auto solver = fcg_factory->generate(mtx); - auto precond = dynamic_cast *>( - static_cast *>(solver.get()) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec), + gko::stop::ResidualNormReduction::build() + .with_reduction_factor( + gko::remove_complex(1e-6)) + .on(this->exec)) + .with_preconditioner( + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(3u).on( + this->exec)) + .on(this->exec)) + .on(this->exec); + auto solver = fcg_factory->generate(this->mtx); + auto precond = dynamic_cast *>( + static_cast *>(solver.get()) ->get_preconditioner() .get()); ASSERT_NE(precond, nullptr); ASSERT_EQ(precond->get_size(), gko::dim<2>(3, 3)); - ASSERT_EQ(precond->get_system_matrix(), mtx); + ASSERT_EQ(precond->get_system_matrix(), this->mtx); +} + + +TYPED_TEST(Fcg, CanSetCriteriaAgain) +{ + using Solver = typename TestFixture::Solver; + std::shared_ptr init_crit = + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec); + auto fcg_factory = Solver::build().with_criteria(init_crit).on(this->exec); + + ASSERT_EQ((fcg_factory->get_parameters().criteria).back(), init_crit); + + auto solver = fcg_factory->generate(this->mtx); + std::shared_ptr new_crit = + gko::stop::Iteration::build().with_max_iters(5u).on(this->exec); + + solver->set_stop_criterion_factory(new_crit); + auto new_crit_fac = solver->get_stop_criterion_factory(); + auto niter = + static_cast(new_crit_fac.get()) + ->get_parameters() + .max_iters; + + ASSERT_EQ(niter, 5); } -TEST_F(Fcg, 
CanSetPreconditionerInFactory) +TYPED_TEST(Fcg, CanSetPreconditionerInFactory) { + using Solver = typename TestFixture::Solver; std::shared_ptr fcg_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec) - ->generate(mtx); + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec) + ->generate(this->mtx); auto fcg_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) .with_generated_preconditioner(fcg_precond) - .on(exec); - auto solver = fcg_factory->generate(mtx); + .on(this->exec); + auto solver = fcg_factory->generate(this->mtx); auto precond = solver->get_preconditioner(); ASSERT_NE(precond.get(), nullptr); @@ -182,42 +236,46 @@ TEST_F(Fcg, CanSetPreconditionerInFactory) } -TEST_F(Fcg, ThrowsOnWrongPreconditionerInFactory) +TYPED_TEST(Fcg, ThrowsOnWrongPreconditionerInFactory) { - std::shared_ptr wrong_sized_mtx = Mtx::create(exec, gko::dim<2>{1, 3}); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + std::shared_ptr wrong_sized_mtx = + Mtx::create(this->exec, gko::dim<2>{1, 3}); std::shared_ptr fcg_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec) ->generate(wrong_sized_mtx); auto fcg_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) .with_generated_preconditioner(fcg_precond) - .on(exec); + .on(this->exec); - ASSERT_THROW(fcg_factory->generate(mtx), gko::DimensionMismatch); + ASSERT_THROW(fcg_factory->generate(this->mtx), gko::DimensionMismatch); } -TEST_F(Fcg, CanSetPreconditioner) +TYPED_TEST(Fcg, CanSetPreconditioner) { + using Solver = typename 
TestFixture::Solver; std::shared_ptr fcg_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec) - ->generate(mtx); + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec) + ->generate(this->mtx); auto fcg_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec); - auto solver = fcg_factory->generate(mtx); + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec); + auto solver = fcg_factory->generate(this->mtx); solver->set_preconditioner(fcg_precond); auto precond = solver->get_preconditioner(); diff --git a/core/test/solver/gmres.cpp b/core/test/solver/gmres.cpp index d6c6a78aab6..4765f07183b 100644 --- a/core/test/solver/gmres.cpp +++ b/core/test/solver/gmres.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include #include @@ -44,18 +43,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include -#include +#include + + +#include "core/test/utils.hpp" namespace { +template class Gmres : public ::testing::Test { protected: - using Mtx = gko::matrix::Dense<>; - using Solver = gko::solver::Gmres<>; + using value_type = T; + using Mtx = gko::matrix::Dense; + using Solver = gko::solver::Gmres; using Big_solver = gko::solver::Gmres; + static constexpr gko::remove_complex reduction_factor = + gko::remove_complex(1e-6); + Gmres() : exec(gko::ReferenceExecutor::create()), mtx(gko::initialize( @@ -64,8 +71,8 @@ class Gmres : public ::testing::Test { Solver::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(3u).on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-6) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(reduction_factor) .on(exec)) .on(exec)), solver(gmres_factory->generate(mtx)), @@ -74,8 +81,8 @@ class Gmres : public ::testing::Test { .with_criteria( gko::stop::Iteration::build().with_max_iters(128u).on( exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-6) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(reduction_factor) .on(exec)) .on(exec)), big_solver(gmres_big_factory->generate(mtx)) @@ -83,7 +90,7 @@ class Gmres : public ::testing::Test { std::shared_ptr exec; std::shared_ptr mtx; - std::unique_ptr gmres_factory; + std::unique_ptr gmres_factory; std::unique_ptr solver; std::unique_ptr gmres_big_factory; std::unique_ptr big_solver; @@ -100,123 +107,200 @@ class Gmres : public ::testing::Test { } }; +template +constexpr gko::remove_complex Gmres::reduction_factor; + +TYPED_TEST_CASE(Gmres, gko::test::ValueTypes); -TEST_F(Gmres, GmresFactoryKnowsItsExecutor) + +TYPED_TEST(Gmres, GmresFactoryKnowsItsExecutor) { - ASSERT_EQ(gmres_factory->get_executor(), exec); + ASSERT_EQ(this->gmres_factory->get_executor(), this->exec); } -TEST_F(Gmres, GmresFactoryCreatesCorrectSolver) +TYPED_TEST(Gmres, 
GmresFactoryCreatesCorrectSolver) { - ASSERT_EQ(solver->get_size(), gko::dim<2>(3, 3)); - auto gmres_solver = static_cast(solver.get()); + using Solver = typename TestFixture::Solver; + ASSERT_EQ(this->solver->get_size(), gko::dim<2>(3, 3)); + auto gmres_solver = static_cast(this->solver.get()); ASSERT_NE(gmres_solver->get_system_matrix(), nullptr); - ASSERT_EQ(gmres_solver->get_system_matrix(), mtx); + ASSERT_EQ(gmres_solver->get_system_matrix(), this->mtx); } -TEST_F(Gmres, CanBeCopied) +TYPED_TEST(Gmres, CanBeCopied) { - auto copy = gmres_factory->generate(Mtx::create(exec)); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto copy = this->gmres_factory->generate(Mtx::create(this->exec)); - copy->copy_from(solver.get()); + copy->copy_from(this->solver.get()); ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3)); auto copy_mtx = static_cast(copy.get())->get_system_matrix(); - assert_same_matrices(static_cast(copy_mtx.get()), mtx.get()); + this->assert_same_matrices(static_cast(copy_mtx.get()), + this->mtx.get()); } -TEST_F(Gmres, CanBeMoved) +TYPED_TEST(Gmres, CanBeMoved) { - auto copy = gmres_factory->generate(Mtx::create(exec)); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto copy = this->gmres_factory->generate(Mtx::create(this->exec)); - copy->copy_from(std::move(solver)); + copy->copy_from(std::move(this->solver)); ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3)); auto copy_mtx = static_cast(copy.get())->get_system_matrix(); - assert_same_matrices(static_cast(copy_mtx.get()), mtx.get()); + this->assert_same_matrices(static_cast(copy_mtx.get()), + this->mtx.get()); } -TEST_F(Gmres, CanBeCloned) +TYPED_TEST(Gmres, CanBeCloned) { - auto clone = solver->clone(); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto clone = this->solver->clone(); ASSERT_EQ(clone->get_size(), gko::dim<2>(3, 3)); auto clone_mtx = 
static_cast(clone.get())->get_system_matrix(); - assert_same_matrices(static_cast(clone_mtx.get()), mtx.get()); + this->assert_same_matrices(static_cast(clone_mtx.get()), + this->mtx.get()); } -TEST_F(Gmres, CanBeCleared) +TYPED_TEST(Gmres, CanBeCleared) { - solver->clear(); + using Solver = typename TestFixture::Solver; + this->solver->clear(); - ASSERT_EQ(solver->get_size(), gko::dim<2>(0, 0)); - auto solver_mtx = static_cast(solver.get())->get_system_matrix(); + ASSERT_EQ(this->solver->get_size(), gko::dim<2>(0, 0)); + auto solver_mtx = + static_cast(this->solver.get())->get_system_matrix(); ASSERT_EQ(solver_mtx, nullptr); } -TEST_F(Gmres, CanSetPreconditionerGenerator) +TYPED_TEST(Gmres, ApplyUsesInitialGuessReturnsTrue) +{ + ASSERT_TRUE(this->solver->apply_uses_initial_guess()); +} + + +TYPED_TEST(Gmres, CanSetPreconditionerGenerator) { + using Solver = typename TestFixture::Solver; + using value_type = typename TestFixture::value_type; auto gmres_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-6) - .on(exec)) - .with_preconditioner(Solver::build().on(exec)) - .on(exec); - auto solver = gmres_factory->generate(mtx); - auto precond = dynamic_cast *>( - static_cast *>(solver.get()) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec), + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(TestFixture::reduction_factor) + .on(this->exec)) + .with_preconditioner( + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(3u).on( + this->exec)) + .on(this->exec)) + .on(this->exec); + auto solver = gmres_factory->generate(this->mtx); + auto precond = dynamic_cast *>( + static_cast *>(solver.get()) ->get_preconditioner() .get()); ASSERT_NE(precond, nullptr); ASSERT_EQ(precond->get_size(), gko::dim<2>(3, 3)); - ASSERT_EQ(precond->get_system_matrix(), mtx); + 
ASSERT_EQ(precond->get_system_matrix(), this->mtx); } -TEST_F(Gmres, CanSetKrylovDim) +TYPED_TEST(Gmres, CanSetCriteriaAgain) { + using Solver = typename TestFixture::Solver; + std::shared_ptr init_crit = + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec); + auto gmres_factory = + Solver::build().with_criteria(init_crit).on(this->exec); + + ASSERT_EQ((gmres_factory->get_parameters().criteria).back(), init_crit); + + auto solver = gmres_factory->generate(this->mtx); + std::shared_ptr new_crit = + gko::stop::Iteration::build().with_max_iters(5u).on(this->exec); + + solver->set_stop_criterion_factory(new_crit); + auto new_crit_fac = solver->get_stop_criterion_factory(); + auto niter = + static_cast(new_crit_fac.get()) + ->get_parameters() + .max_iters; + + ASSERT_EQ(niter, 5); +} + + +TYPED_TEST(Gmres, CanSetKrylovDim) +{ + using Solver = typename TestFixture::Solver; + using value_type = typename TestFixture::value_type; auto gmres_factory = Solver::build() .with_krylov_dim(4u) .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u).on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-6) - .on(exec)) - .on(exec); - auto solver = gmres_factory->generate(mtx); + gko::stop::Iteration::build().with_max_iters(4u).on(this->exec), + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(TestFixture::reduction_factor) + .on(this->exec)) + .on(this->exec); + auto solver = gmres_factory->generate(this->mtx); auto krylov_dim = solver->get_krylov_dim(); ASSERT_EQ(krylov_dim, 4); } -TEST_F(Gmres, CanSetPreconditionerInFactory) +TYPED_TEST(Gmres, CanSetKrylovDimAgain) +{ + using Solver = typename TestFixture::Solver; + std::shared_ptr init_crit = + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec); + auto gmres_factory = + Solver::build().with_criteria(init_crit).with_krylov_dim(10u).on( + this->exec); + + ASSERT_EQ(gmres_factory->get_parameters().krylov_dim, 10); + + auto solver = 
gmres_factory->generate(this->mtx); + + solver->set_krylov_dim(20); + + ASSERT_EQ(solver->get_krylov_dim(), 20); +} + + +TYPED_TEST(Gmres, CanSetPreconditionerInFactory) { + using Solver = typename TestFixture::Solver; std::shared_ptr gmres_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec) - ->generate(mtx); + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec) + ->generate(this->mtx); auto gmres_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) .with_generated_preconditioner(gmres_precond) - .on(exec); - auto solver = gmres_factory->generate(mtx); + .on(this->exec); + auto solver = gmres_factory->generate(this->mtx); auto precond = solver->get_preconditioner(); ASSERT_NE(precond.get(), nullptr); @@ -224,42 +308,46 @@ TEST_F(Gmres, CanSetPreconditionerInFactory) } -TEST_F(Gmres, ThrowsOnWrongPreconditionerInFactory) +TYPED_TEST(Gmres, ThrowsOnWrongPreconditionerInFactory) { - std::shared_ptr wrong_sized_mtx = Mtx::create(exec, gko::dim<2>{1, 3}); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + std::shared_ptr wrong_sized_mtx = + Mtx::create(this->exec, gko::dim<2>{1, 3}); std::shared_ptr gmres_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec) ->generate(wrong_sized_mtx); auto gmres_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) .with_generated_preconditioner(gmres_precond) - .on(exec); + .on(this->exec); - ASSERT_THROW(gmres_factory->generate(mtx), gko::DimensionMismatch); + ASSERT_THROW(gmres_factory->generate(this->mtx), 
gko::DimensionMismatch); } -TEST_F(Gmres, CanSetPreconditioner) +TYPED_TEST(Gmres, CanSetPreconditioner) { + using Solver = typename TestFixture::Solver; std::shared_ptr gmres_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec) - ->generate(mtx); + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec) + ->generate(this->mtx); auto gmres_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec); - auto solver = gmres_factory->generate(mtx); + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec); + auto solver = gmres_factory->generate(this->mtx); solver->set_preconditioner(gmres_precond); auto precond = solver->get_preconditioner(); diff --git a/core/test/solver/ir.cpp b/core/test/solver/ir.cpp index d38024b3806..b711c511e97 100644 --- a/core/test/solver/ir.cpp +++ b/core/test/solver/ir.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -43,16 +43,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include -#include +#include + + +#include "core/test/utils.hpp" namespace { +template class Ir : public ::testing::Test { protected: - using Mtx = gko::matrix::Dense<>; - using Solver = gko::solver::Ir<>; + using value_type = T; + using Mtx = gko::matrix::Dense; + using Solver = gko::solver::Ir; Ir() : exec(gko::ReferenceExecutor::create()), @@ -62,8 +67,8 @@ class Ir : public ::testing::Test { Solver::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(3u).on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-6) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) .on(exec)) .on(exec)), solver(ir_factory->generate(mtx)) @@ -71,7 +76,7 @@ class Ir : public ::testing::Test { std::shared_ptr exec; std::shared_ptr mtx; - std::unique_ptr ir_factory; + std::unique_ptr ir_factory; std::unique_ptr solver; static void assert_same_matrices(const Mtx *m1, const Mtx *m2) @@ -86,103 +91,131 @@ class Ir : public ::testing::Test { } }; +TYPED_TEST_CASE(Ir, gko::test::ValueTypes); -TEST_F(Ir, IrFactoryKnowsItsExecutor) + +TYPED_TEST(Ir, IrFactoryKnowsItsExecutor) { - ASSERT_EQ(ir_factory->get_executor(), exec); + ASSERT_EQ(this->ir_factory->get_executor(), this->exec); } -TEST_F(Ir, IrFactoryCreatesCorrectSolver) +TYPED_TEST(Ir, IrFactoryCreatesCorrectSolver) { - ASSERT_EQ(solver->get_size(), gko::dim<2>(3, 3)); - auto cg_solver = static_cast(solver.get()); + using Solver = typename TestFixture::Solver; + ASSERT_EQ(this->solver->get_size(), gko::dim<2>(3, 3)); + auto cg_solver = static_cast(this->solver.get()); ASSERT_NE(cg_solver->get_system_matrix(), nullptr); - ASSERT_EQ(cg_solver->get_system_matrix(), mtx); + ASSERT_EQ(cg_solver->get_system_matrix(), this->mtx); } -TEST_F(Ir, CanBeCopied) +TYPED_TEST(Ir, CanBeCopied) { - auto copy = ir_factory->generate(Mtx::create(exec)); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto copy = 
this->ir_factory->generate(Mtx::create(this->exec)); - copy->copy_from(solver.get()); + copy->copy_from(this->solver.get()); ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3)); auto copy_mtx = static_cast(copy.get())->get_system_matrix(); - assert_same_matrices(static_cast(copy_mtx.get()), mtx.get()); + this->assert_same_matrices(static_cast(copy_mtx.get()), + this->mtx.get()); } -TEST_F(Ir, CanBeMoved) +TYPED_TEST(Ir, CanBeMoved) { - auto copy = ir_factory->generate(Mtx::create(exec)); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto copy = this->ir_factory->generate(Mtx::create(this->exec)); - copy->copy_from(std::move(solver)); + copy->copy_from(std::move(this->solver)); ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3)); auto copy_mtx = static_cast(copy.get())->get_system_matrix(); - assert_same_matrices(static_cast(copy_mtx.get()), mtx.get()); + this->assert_same_matrices(static_cast(copy_mtx.get()), + this->mtx.get()); } -TEST_F(Ir, CanBeCloned) +TYPED_TEST(Ir, CanBeCloned) { - auto clone = solver->clone(); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto clone = this->solver->clone(); ASSERT_EQ(clone->get_size(), gko::dim<2>(3, 3)); auto clone_mtx = static_cast(clone.get())->get_system_matrix(); - assert_same_matrices(static_cast(clone_mtx.get()), mtx.get()); + this->assert_same_matrices(static_cast(clone_mtx.get()), + this->mtx.get()); } -TEST_F(Ir, CanBeCleared) +TYPED_TEST(Ir, CanBeCleared) { - solver->clear(); + using Solver = typename TestFixture::Solver; + this->solver->clear(); - ASSERT_EQ(solver->get_size(), gko::dim<2>(0, 0)); - auto solver_mtx = static_cast(solver.get())->get_system_matrix(); + ASSERT_EQ(this->solver->get_size(), gko::dim<2>(0, 0)); + auto solver_mtx = + static_cast(this->solver.get())->get_system_matrix(); ASSERT_EQ(solver_mtx, nullptr); } -TEST_F(Ir, CanSetInnerSolverInFactory) +TYPED_TEST(Ir, ApplyUsesInitialGuessReturnsTrue) +{ + 
ASSERT_TRUE(this->solver->apply_uses_initial_guess()); +} + + +TYPED_TEST(Ir, CanSetInnerSolverInFactory) { + using Solver = typename TestFixture::Solver; + using value_type = typename TestFixture::value_type; auto ir_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-6) - .on(exec)) - .with_solver(Solver::build().on(exec)) - .on(exec); - auto solver = ir_factory->generate(mtx); - auto inner_solver = dynamic_cast *>( - static_cast *>(solver.get())->get_solver().get()); + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec), + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) + .on(this->exec)) + .with_solver( + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(3u).on( + this->exec)) + .on(this->exec)) + .on(this->exec); + auto solver = ir_factory->generate(this->mtx); + auto inner_solver = dynamic_cast( + static_cast(solver.get())->get_solver().get()); ASSERT_NE(inner_solver, nullptr); ASSERT_EQ(inner_solver->get_size(), gko::dim<2>(3, 3)); - ASSERT_EQ(inner_solver->get_system_matrix(), mtx); + ASSERT_EQ(inner_solver->get_system_matrix(), this->mtx); } -TEST_F(Ir, CanSetGeneratedInnerSolverInFactory) +TYPED_TEST(Ir, CanSetGeneratedInnerSolverInFactory) { + using Solver = typename TestFixture::Solver; std::shared_ptr ir_solver = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec) - ->generate(mtx); + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec) + ->generate(this->mtx); auto ir_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) .with_generated_solver(ir_solver) - .on(exec); - auto solver = ir_factory->generate(mtx); + .on(this->exec); + auto solver = 
ir_factory->generate(this->mtx); auto inner_solver = solver->get_solver(); ASSERT_NE(inner_solver.get(), nullptr); @@ -190,42 +223,70 @@ TEST_F(Ir, CanSetGeneratedInnerSolverInFactory) } -TEST_F(Ir, ThrowsOnWrongInnerSolverInFactory) +TYPED_TEST(Ir, CanSetCriteriaAgain) { - std::shared_ptr wrong_sized_mtx = Mtx::create(exec, gko::dim<2>{1, 3}); + using Solver = typename TestFixture::Solver; + std::shared_ptr init_crit = + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec); + auto ir_factory = Solver::build().with_criteria(init_crit).on(this->exec); + + ASSERT_EQ((ir_factory->get_parameters().criteria).back(), init_crit); + + auto solver = ir_factory->generate(this->mtx); + std::shared_ptr new_crit = + gko::stop::Iteration::build().with_max_iters(5u).on(this->exec); + + solver->set_stop_criterion_factory(new_crit); + auto new_crit_fac = solver->get_stop_criterion_factory(); + auto niter = + static_cast(new_crit_fac.get()) + ->get_parameters() + .max_iters; + + ASSERT_EQ(niter, 5); +} + + +TYPED_TEST(Ir, ThrowsOnWrongInnerSolverInFactory) +{ + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + std::shared_ptr wrong_sized_mtx = + Mtx::create(this->exec, gko::dim<2>{1, 3}); std::shared_ptr ir_solver = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec) ->generate(wrong_sized_mtx); auto ir_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) .with_generated_solver(ir_solver) - .on(exec); + .on(this->exec); - ASSERT_THROW(ir_factory->generate(mtx), gko::DimensionMismatch); + ASSERT_THROW(ir_factory->generate(this->mtx), gko::DimensionMismatch); } -TEST_F(Ir, CanSetInnerSolver) +TYPED_TEST(Ir, CanSetInnerSolver) { + using Solver = typename TestFixture::Solver; 
std::shared_ptr ir_solver = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec) - ->generate(mtx); + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec) + ->generate(this->mtx); auto ir_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec); - auto solver = ir_factory->generate(mtx); + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec); + auto solver = ir_factory->generate(this->mtx); solver->set_solver(ir_solver); auto inner_solver = solver->get_solver(); @@ -234,25 +295,67 @@ TEST_F(Ir, CanSetInnerSolver) } -TEST_F(Ir, ThrowOnWrongInnerSolverSet) +TYPED_TEST(Ir, ThrowOnWrongInnerSolverSet) { - std::shared_ptr wrong_sized_mtx = Mtx::create(exec, gko::dim<2>{1, 3}); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + std::shared_ptr wrong_sized_mtx = + Mtx::create(this->exec, gko::dim<2>{1, 3}); std::shared_ptr ir_solver = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec) + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec) ->generate(wrong_sized_mtx); auto ir_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) - .on(exec); - auto solver = ir_factory->generate(mtx); + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .on(this->exec); + auto solver = ir_factory->generate(this->mtx); ASSERT_THROW(solver->set_solver(ir_solver), gko::DimensionMismatch); } +TYPED_TEST(Ir, DefaultRelaxationFactor) +{ + using value_type = typename TestFixture::value_type; + const value_type relaxation_factor{0.5}; + + auto richardson = + gko::solver::Richardson::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec), + gko::stop::ResidualNormReduction::build() + 
.with_reduction_factor(r::value) + .on(this->exec)) + .on(this->exec) + ->generate(this->mtx); + + ASSERT_EQ(richardson->get_parameters().relaxation_factor, value_type{1}); +} + + +TYPED_TEST(Ir, UseAsRichardson) +{ + using value_type = typename TestFixture::value_type; + const value_type relaxation_factor{0.5}; + + auto richardson = + gko::solver::Richardson::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(3u).on(this->exec), + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) + .on(this->exec)) + .with_relaxation_factor(relaxation_factor) + .on(this->exec) + ->generate(this->mtx); + + ASSERT_EQ(richardson->get_parameters().relaxation_factor, value_type{0.5}); +} + + } // namespace diff --git a/core/test/solver/lower_trs.cpp b/core/test/solver/lower_trs.cpp index c32afcc6e83..be12f10ef53 100644 --- a/core/test/solver/lower_trs.cpp +++ b/core/test/solver/lower_trs.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -42,15 +42,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include "core/test/utils/assertions.hpp" +#include "core/test/utils.hpp" namespace { +template class LowerTrs : public ::testing::Test { protected: - using Solver = gko::solver::LowerTrs<>; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Solver = gko::solver::LowerTrs; LowerTrs() : exec(gko::ReferenceExecutor::create()), @@ -58,13 +63,15 @@ class LowerTrs : public ::testing::Test { {} std::shared_ptr exec; - std::unique_ptr lower_trs_factory; + std::unique_ptr lower_trs_factory; }; +TYPED_TEST_CASE(LowerTrs, gko::test::ValueIndexTypes); -TEST_F(LowerTrs, LowerTrsFactoryKnowsItsExecutor) + +TYPED_TEST(LowerTrs, LowerTrsFactoryKnowsItsExecutor) { - ASSERT_EQ(lower_trs_factory->get_executor(), exec); + ASSERT_EQ(this->lower_trs_factory->get_executor(), this->exec); } diff --git a/core/test/solver/upper_trs.cpp b/core/test/solver/upper_trs.cpp index 2e8b629e186..1ec759fa47d 100644 --- a/core/test/solver/upper_trs.cpp +++ b/core/test/solver/upper_trs.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -42,15 +42,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include "core/test/utils/assertions.hpp" +#include "core/test/utils.hpp" namespace { +template class UpperTrs : public ::testing::Test { protected: - using Solver = gko::solver::UpperTrs<>; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Solver = gko::solver::UpperTrs; UpperTrs() : exec(gko::ReferenceExecutor::create()), @@ -58,13 +63,15 @@ class UpperTrs : public ::testing::Test { {} std::shared_ptr exec; - std::unique_ptr upper_trs_factory; + std::unique_ptr upper_trs_factory; }; +TYPED_TEST_CASE(UpperTrs, gko::test::ValueIndexTypes); -TEST_F(UpperTrs, UpperTrsFactoryKnowsItsExecutor) + +TYPED_TEST(UpperTrs, UpperTrsFactoryKnowsItsExecutor) { - ASSERT_EQ(upper_trs_factory->get_executor(), exec); + ASSERT_EQ(this->upper_trs_factory->get_executor(), this->exec); } diff --git a/core/test/stop/combined.cpp b/core/test/stop/combined.cpp index 8aff6707cc3..8a443790429 100644 --- a/core/test/stop/combined.cpp +++ b/core/test/stop/combined.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,12 +33,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include -#include +#include +#include #include -#include + + +#include +#include namespace { @@ -86,4 +89,86 @@ TEST_F(Combined, CanCreateCriterion) } +TEST_F(Combined, CanIgnoreNullptr) +{ + auto combined = gko::stop::Combined::build() + .with_criteria(gko::stop::Iteration::build() + .with_max_iters(test_iterations) + .on(exec_), + nullptr) + .on(exec_); + + ASSERT_NO_THROW(combined->generate(nullptr, nullptr, nullptr)); +} + + +TEST_F(Combined, CanThrowAllNullptr) +{ + auto combined = + gko::stop::Combined::build().with_criteria(nullptr, nullptr).on(exec_); + + ASSERT_THROW(combined->generate(nullptr, nullptr, nullptr), + gko::NotSupported); +} + + +TEST_F(Combined, CanThrowWithoutInput) +{ + auto combined = gko::stop::Combined::build().on(exec_); + + ASSERT_THROW(combined->generate(nullptr, nullptr, nullptr), + gko::NotSupported); +} + + +TEST_F(Combined, FunctionCanThrowWithoutInput) +{ + std::vector> + criterion_vec{}; + + ASSERT_THROW(gko::stop::combine(criterion_vec), gko::NotSupported); +} + + +TEST_F(Combined, FunctionCanThrowOnlyOneNullptr) +{ + std::vector> + criterion_vec{nullptr}; + + ASSERT_THROW(gko::stop::combine(criterion_vec), gko::NotSupported); +} + + +TEST_F(Combined, FunctionCanThrowAllNullptr) +{ + std::vector> + criterion_vec{nullptr, nullptr}; + + ASSERT_THROW(gko::stop::combine(criterion_vec), gko::NotSupported); +} + + +TEST_F(Combined, FunctionCanThrowFirstIsInvalid) +{ + auto stop = + gko::stop::Iteration::build().with_max_iters(test_iterations).on(exec_); + std::vector> + criterion_vec{nullptr, gko::share(stop)}; + + ASSERT_THROW(gko::stop::combine(criterion_vec), gko::NotSupported); +} + + +TEST_F(Combined, FunctionCanIgnoreNullptr) +{ + auto stop = + gko::stop::Iteration::build().with_max_iters(test_iterations).on(exec_); + std::vector> + criterion_vec{gko::share(stop), nullptr}; + auto combined = gko::stop::combine(criterion_vec); + + ASSERT_NO_THROW(combined->generate(nullptr, nullptr, nullptr)); +} + + } // namespace 
diff --git a/core/test/stop/iteration.cpp b/core/test/stop/iteration.cpp index 2e8733167a7..aedc443eb76 100644 --- a/core/test/stop/iteration.cpp +++ b/core/test/stop/iteration.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/core/test/stop/stopping_status.cpp b/core/test/stop/stopping_status.cpp index bc42727083b..d9cdebc165e 100644 --- a/core/test/stop/stopping_status.cpp +++ b/core/test/stop/stopping_status.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/core/test/stop/time.cpp b/core/test/stop/time.cpp index e45fec32f47..53966fbacad 100644 --- a/core/test/stop/time.cpp +++ b/core/test/stop/time.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,11 +33,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include #include +#include + + namespace { diff --git a/core/test/utils.hpp b/core/test/utils.hpp index d03ea69fe46..89b135a01f3 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,8 +34,80 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_CORE_TEST_UTILS_HPP_ +#include +#include +#include + + +#include +#include + + #include "core/test/utils/assertions.hpp" #include "core/test/utils/matrix_generator.hpp" +namespace gko { +namespace test { + + +using ValueTypes = + ::testing::Types, std::complex>; + + +using ComplexValueTypes = + ::testing::Types, std::complex>; + + +using IndexTypes = ::testing::Types; + + +using ValueAndIndexTypes = + ::testing::Types, std::complex, + gko::int32, gko::int64, gko::size_type>; + + +using ValueIndexTypes = ::testing::Types< + std::tuple, std::tuple, + std::tuple, gko::int32>, + std::tuple, gko::int32>, std::tuple, + std::tuple, std::tuple, gko::int64>, + std::tuple, gko::int64>>; + + +using RealValueIndexTypes = ::testing::Types< + std::tuple, std::tuple, + std::tuple, std::tuple>; + + +using ComplexValueIndexTypes = + ::testing::Types, gko::int32>, + std::tuple, gko::int32>, + std::tuple, gko::int64>, + std::tuple, gko::int64>>; + + +template +struct reduction_factor { + static constexpr gko::remove_complex value = + std::is_same, float>::value ? 1.0e-7 : 1.0e-14; +}; + + +template +constexpr gko::remove_complex reduction_factor::value; + + +} // namespace test +} // namespace gko + + +template +using r = typename gko::test::reduction_factor; + + +template +using I = std::initializer_list; + + #endif // GKO_CORE_TEST_UTILS_HPP_ diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index 2ba6811921f..88b38561511 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,24 +34,104 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_CORE_TEST_UTILS_ASSERTIONS_HPP_ -#include +#include #include +#include #include +#include #include #include #include +#include + + #include #include +#include "core/base/extended_float.hpp" + + namespace gko { namespace test { namespace assertions { namespace detail { +/** + * Structure helper to return the biggest valuetype able to contain values from + * both ValueType1 and ValueType2. + * + * @tparam ValueType1 the first valuetype to compare + * @tparam ValueType2 the second valuetype to compare + * @tparam T enable_if placeholder + */ +template +struct biggest_valuetype { + /** The type. This default is good but should not be used due to the + * enable_if versions. */ + using type = std::complex; +}; + + +/** + * Specialization when both ValueType1 and ValueType2 are the same. + * + * @copydoc biggest_valuetype + */ +template +struct biggest_valuetype::value>::type> { + /** The type. */ + using type = ValueType1; +}; + + +/** + * Specialization when both ValueType1 and ValueType2 are different but non + * complex. + * + * @copydoc biggest_valuetype + */ +template +struct biggest_valuetype< + ValueType1, ValueType2, + typename std::enable_if::value && + !(gko::is_complex_s::value || + gko::is_complex_s::value)>::type> { + /** The type. We pick the bigger of the two. */ + using type = typename std::conditional::type; +}; + + +/** + * Specialization when both ValueType1 and ValueType2 are different and one of + * them is complex. + * + * @copydoc biggest_valuetype + */ +template +class biggest_valuetype< + ValueType1, ValueType2, + typename std::enable_if::value && + (gko::is_complex_s::value || + gko::is_complex_s::value)>::type> { + using real_vt1 = remove_complex; + using real_vt2 = remove_complex; + +public: + /** The type. We make a complex with the bigger real of the two. 
*/ + using type = typename std::conditional< + xstd::greater(sizeof(real_vt1), sizeof(real_vt2)), + std::complex, std::complex>::type; +}; + + template auto get_next_value(NonzeroIterator &it, const NonzeroIterator &end, size_type next_row, size_type next_col) -> @@ -83,17 +163,23 @@ template void print_componentwise_error(Ostream &os, const MatrixData1 &first, const MatrixData2 &second) { - using real_vt = remove_complex; + using std::abs; + using vt = typename detail::biggest_valuetype< + typename MatrixData1::value_type, + typename MatrixData2::value_type>::type; + using real_vt = remove_complex; + auto first_it = begin(first.nonzeros); auto second_it = begin(second.nonzeros); for (size_type row = 0; row < first.size[0]; ++row) { os << "\t"; for (size_type col = 0; col < first.size[1]; ++col) { - auto r = get_next_value(first_it, end(first.nonzeros), row, col); - auto e = get_next_value(second_it, end(second.nonzeros), row, col); - auto m = - max(static_cast(abs(r)), static_cast(abs(e))); - if (m == zero()) { + auto r = + vt{get_next_value(first_it, end(first.nonzeros), row, col)}; + auto e = + vt{get_next_value(second_it, end(second.nonzeros), row, col)}; + auto m = std::max(abs(r), abs(e)); + if (m == zero()) { os << abs(r - e) << "\t"; } else { os << abs((r - e) / m) << "\t"; @@ -103,21 +189,37 @@ void print_componentwise_error(Ostream &os, const MatrixData1 &first, } } +template +void print_columns(Ostream &os, const Iterator &begin, const Iterator &end) +{ + for (auto it = begin; it != end; ++it) { + os << '\t' << it->column; + } + os << '\n'; +} + template double get_relative_error(const MatrixData1 &first, const MatrixData2 &second) { - double diff = 0.0; - double first_norm = 0.0; - double second_norm = 0.0; + using std::abs; + using vt = typename detail::biggest_valuetype< + typename MatrixData1::value_type, + typename MatrixData2::value_type>::type; + using real_vt = remove_complex; + + real_vt diff = 0.0; + real_vt first_norm = 0.0; + real_vt 
second_norm = 0.0; auto first_it = begin(first.nonzeros); auto second_it = begin(second.nonzeros); for (size_type row = 0; row < first.size[0]; ++row) { for (size_type col = 0; col < first.size[1]; ++col) { const auto first_val = - get_next_value(first_it, end(first.nonzeros), row, col); + vt{get_next_value(first_it, end(first.nonzeros), row, col)}; const auto second_val = - get_next_value(second_it, end(second.nonzeros), row, col); + vt{get_next_value(second_it, end(second.nonzeros), row, col)}; + diff += squared_norm(first_val - second_val); first_norm += squared_norm(first_val); second_norm += squared_norm(second_val); @@ -126,7 +228,7 @@ double get_relative_error(const MatrixData1 &first, const MatrixData2 &second) if (first_norm == 0.0 && second_norm == 0.0) { first_norm = 1.0; } - return sqrt(diff / max(first_norm, second_norm)); + return sqrt(diff / std::max(first_norm, second_norm)); } @@ -155,24 +257,112 @@ ::testing::AssertionResult matrices_near_impl( << second_expression << " is " << err << "\n" << "\twhich is larger than " << tolerance_expression << " (which is " << tolerance << ")\n"; - fail << first_expression << " is:\n"; - detail::print_matrix(fail, first); - fail << second_expression << " is:\n"; - detail::print_matrix(fail, second); - fail << "component-wise relative error is:\n"; - detail::print_componentwise_error(fail, first, second); + if (num_rows * num_cols <= 1000) { + fail << first_expression << " is:\n"; + detail::print_matrix(fail, first); + fail << second_expression << " is:\n"; + detail::print_matrix(fail, second); + fail << "component-wise relative error is:\n"; + detail::print_componentwise_error(fail, first, second); + } else { + // build output filenames + auto test_case_info = + ::testing::UnitTest::GetInstance()->current_test_info(); + auto testname = + test_case_info ? std::string{test_case_info->test_case_name()} + + "." + test_case_info->name() + : std::string{"null"}; + auto firstfile = testname + "." 
+ first_expression + ".mtx"; + auto secondfile = testname + "." + second_expression + ".mtx"; + auto to_remove = [](char c) { + return !std::isalnum(c) && c != '_' && c != '.' && c != '-' && + c != '<' && c != '>'; + }; + // remove all but alphanumerical and _.-<> characters from + // expressions + firstfile.erase( + std::remove_if(firstfile.begin(), firstfile.end(), to_remove), + firstfile.end()); + secondfile.erase( + std::remove_if(secondfile.begin(), secondfile.end(), to_remove), + secondfile.end()); + // save matrices + std::ofstream first_stream{firstfile}; + gko::write_raw(first_stream, first, gko::layout_type::coordinate); + std::ofstream second_stream{secondfile}; + gko::write_raw(second_stream, second, gko::layout_type::coordinate); + fail << first_expression << " saved as " << firstfile << "\n"; + fail << second_expression << " saved as " << secondfile << "\n"; + } return fail; } } +template +::testing::AssertionResult matrices_equal_sparsity_impl( + const std::string &first_expression, const std::string &second_expression, + const MatrixData1 &first, const MatrixData2 &second) +{ + auto num_rows = first.size[0]; + auto num_cols = first.size[1]; + if (num_rows != second.size[0] || num_cols != second.size[1]) { + return ::testing::AssertionFailure() + << "Expected matrices of equal size\n\t" << first_expression + << " is of size [" << num_rows << " x " << num_cols << "]\n\t" + << second_expression << " is of size [" << second.size[0] + << " x " << second.size[1] << "]"; + } + + auto fst_it = begin(first.nonzeros); + auto snd_it = begin(second.nonzeros); + auto fst_end = end(first.nonzeros); + auto snd_end = end(second.nonzeros); + using nz_type_f = typename std::decay::type; + using nz_type_s = typename std::decay::type; + for (size_type row = 0; row < num_rows; ++row) { + auto cmp_l_f = [](nz_type_f nz, size_type row) { return nz.row < row; }; + auto cmp_u_f = [](size_type row, nz_type_f nz) { return row < nz.row; }; + auto cmp_l_s = [](nz_type_s nz, 
size_type row) { return nz.row < row; }; + auto cmp_u_s = [](size_type row, nz_type_s nz) { return row < nz.row; }; + auto col_eq = [](nz_type_f a, nz_type_s b) { + return a.column == b.column; + }; + auto fst_row_begin = std::lower_bound(fst_it, fst_end, row, cmp_l_f); + auto snd_row_begin = std::lower_bound(snd_it, snd_end, row, cmp_l_s); + auto fst_row_end = + std::upper_bound(fst_row_begin, fst_end, row, cmp_u_f); + auto snd_row_end = + std::upper_bound(snd_row_begin, snd_end, row, cmp_u_s); + if (std::distance(fst_row_begin, fst_row_end) != + std::distance(snd_row_begin, snd_row_end) || + !std::equal(fst_row_begin, fst_row_end, snd_row_begin, col_eq)) { + auto fail = ::testing::AssertionFailure(); + fail << "Sparsity pattern differs between " << first_expression + << " and " << second_expression << "\nIn row " << row << " " + << first_expression << " has " << (fst_row_end - fst_row_begin) + << " columns:\n"; + detail::print_columns(fail, fst_row_begin, fst_row_end); + fail << "and " << second_expression << " has " + << (snd_row_end - snd_row_begin) << " columns:\n"; + detail::print_columns(fail, snd_row_begin, snd_row_end); + return fail; + } + fst_it = fst_row_end; + snd_it = snd_row_end; + } + + return ::testing::AssertionSuccess(); +} + + template ::testing::AssertionResult array_equal_impl( const std::string &first_expression, const std::string &second_expression, - const Array *first, const Array *second) + const Array &first, const Array &second) { - const auto num_elems1 = first->get_num_elems(); - const auto num_elems2 = second->get_num_elems(); + const auto num_elems1 = first.get_num_elems(); + const auto num_elems2 = second.get_num_elems(); if (num_elems1 != num_elems2) { auto fail = ::testing::AssertionFailure(); fail << "Array " << first_expression << " contains " << num_elems1 @@ -181,10 +371,10 @@ ::testing::AssertionResult array_equal_impl( return fail; } - auto exec = first->get_executor()->get_master(); - Array first_array(exec, *first); - 
Array second_array(exec, *second); - for (decltype(first->get_num_elems()) i = 0; i < num_elems1; ++i) { + auto exec = first.get_executor()->get_master(); + Array first_array(exec, first); + Array second_array(exec, second); + for (decltype(first.get_num_elems()) i = 0; i < num_elems1; ++i) { if (!(first_array.get_const_data()[i] == second_array.get_const_data()[i])) { auto fail = ::testing::AssertionFailure(); @@ -246,6 +436,85 @@ std::string remove_list_wrapper(const std::string &expression) } // namespace detail +/** + * This is a gtest predicate which checks if two values are relatively near. + * + * This function should not be called directly, but used in conjunction with + * `ASSERT_PRED_FORMAT3` as follows: + * + * ``` + * // Check if first and second are near + * ASSERT_PRED_FORMAT3(gko::test::assertions::values_near, + * first, second, tolerance); + * // Check if first and second are far + * ASSERT_PRED_FORMAT3(!gko::test::assertions::values_near, + * first, second, tolerance); + * ``` + * + * @see GKO_ASSERT_MTX_NEAR + * @see GKO_EXPECT_MTX_NEAR + */ +template +::testing::AssertionResult values_near(const std::string &first_expression, + const std::string &second_expression, + const std::string &tolerance_expression, + T val1, U val2, double abs_error) +{ + static_assert(std::is_same(), + "The types of the operands should be the same."); + const double diff = abs(val1 - val2); + if (diff <= abs_error) return ::testing::AssertionSuccess(); + + return ::testing::AssertionFailure() + << "The difference between " << first_expression << " and " + << second_expression << " is " << diff << ", which exceeds " + << tolerance_expression << ", where\n" + << first_expression << " evaluates to " << val1 << ",\n" + << second_expression << " evaluates to " << val2 << ", and\n" + << tolerance_expression << " evaluates to " << abs_error << "."; +} + + +template <> +::testing::AssertionResult values_near( + const std::string &first_expression, const std::string 
&second_expression, + const std::string &tolerance_expression, gko::half val1, gko::half val2, + double abs_error) +{ + using T = float32; + const double diff = abs(T{val1} - T{val2}); + if (diff <= abs_error) return ::testing::AssertionSuccess(); + + return ::testing::AssertionFailure() + << "The difference between " << first_expression << " and " + << second_expression << " is " << diff << ", which exceeds " + << tolerance_expression << ", where\n" + << first_expression << " evaluates to " << T{val1} << ",\n" + << second_expression << " evaluates to " << T{val2} << ", and\n" + << tolerance_expression << " evaluates to " << abs_error << "."; +} + + +template <> +::testing::AssertionResult values_near, std::complex>( + const std::string &first_expression, const std::string &second_expression, + const std::string &tolerance_expression, std::complex val1, + std::complex val2, double abs_error) +{ + using T = std::complex; + const double diff = abs(T{val1} - T{val2}); + if (diff <= abs_error) return ::testing::AssertionSuccess(); + + return ::testing::AssertionFailure() + << "The difference between " << first_expression << " and " + << second_expression << " is " << diff << ", which exceeds " + << tolerance_expression << ", where\n" + << first_expression << " evaluates to " << T{val1} << ",\n" + << second_expression << " evaluates to " << T{val2} << ", and\n" + << tolerance_expression << " evaluates to " << abs_error << "."; +} + + /** * This is a gtest predicate which checks if two matrices are relatively near. 
* @@ -327,12 +596,11 @@ ::testing::AssertionResult matrices_near( template ::testing::AssertionResult array_equal(const std::string &first_expression, const std::string &second_expression, - const Array *first, - const Array *second) + const Array &first, + const Array &second) { - return detail::array_equal_impl( - detail::remove_pointer_wrapper(first_expression), - detail::remove_pointer_wrapper(second_expression), first, second); + return detail::array_equal_impl(first_expression, second_expression, first, + second); } @@ -361,6 +629,52 @@ ::testing::AssertionResult str_contains(const std::string &first_expression, } +/** + * This is a gtest predicate which checks if two matrices have the same sparsity + * pattern. + * + * This means that hat mtx1 and mtx2 have exactly the same non-zero locations + * (including zero values!) + * + * This function should not be called directly, but used in conjunction with + * `ASSERT_PRED_FORMAT2` as follows: + * + * ``` + * // Check if first and second are equal + * ASSERT_PRED_FORMAT2(gko::test::assertions::matrices_equal_sparsity, + * first, second); + * // Check if first and second are not equal + * ASSERT_PRED_FORMAT2(!gko::test::assertions::matrices_equal_sparsity, + * first, second); + * ``` + * + * @see GKO_ASSERT_MTX_NEAR + * @see GKO_EXPECT_MTX_NEAR + */ +template +::testing::AssertionResult matrices_equal_sparsity( + const std::string &first_expression, const std::string &second_expression, + const LinOp1 *first, const LinOp2 *second) +{ + auto exec = first->get_executor()->get_master(); + matrix_data + first_data; + matrix_data + second_data; + + first->write(first_data); + second->write(second_data); + + first_data.ensure_row_major_order(); + second_data.ensure_row_major_order(); + + return detail::matrices_equal_sparsity_impl( + detail::remove_pointer_wrapper(first_expression), + detail::remove_pointer_wrapper(second_expression), first_data, + second_data); +} + + namespace detail { @@ -383,7 +697,6 @@ T &&l(T 
&&matrix) return std::forward(matrix); } - template T *plain_ptr(const std::shared_ptr &ptr) { @@ -409,6 +722,33 @@ T plain_ptr(T ptr) } // namespace gko +/** + * Checks if two values are near each other. + * + * Has to be called from within a google test unit test. + * Internally calls gko::test::assertions::values_near(). + * + * @param _val1 first value + * @param _val2 second value + * @param _tol tolerance level + */ +#define GKO_ASSERT_NEAR(_val1, _val2, _tol) \ + { \ + ASSERT_PRED_FORMAT3(::gko::test::assertions::values_near, _val1, \ + _val2, _tol); \ + } + + +/** + * @copydoc GKO_ASSERT_NEAR + */ +#define GKO_EXPECT_NEAR(_val1, _val2, _tol) \ + { \ + EXPECT_PRED_FORMAT3(::gko::test::assertions::values_near, _val1, \ + _val2, _tol); \ + } + + /** * Checks if two matrices are near each other. * @@ -445,6 +785,38 @@ T plain_ptr(T ptr) plain_ptr(_mtx1), plain_ptr(_mtx2), _tol); \ } +/** + * Checks if two matrices have the same sparsity pattern. + * + * This means that mtx1 and mtx2 have exactly the same non-zero locations + * (including zero values!) + * + * Has to be called from within a google test unit test. + * Internally calls gko::test::assertions::matrices_equal_sparsity(). + * + * @param _mtx1 first matrix + * @param _mtx2 second matrix + */ +#define GKO_ASSERT_MTX_EQ_SPARSITY(_mtx1, _mtx2) \ + { \ + using ::gko::test::assertions::detail::l; \ + using ::gko::test::assertions::detail::plain_ptr; \ + ASSERT_PRED_FORMAT2(::gko::test::assertions::matrices_equal_sparsity, \ + plain_ptr(_mtx1), plain_ptr(_mtx2)); \ + } + + +/** + * @copydoc GKO_ASSERT_MTX_EQ_SPARSITY + */ +#define GKO_EXPECT_MTX_EQ_SPARSITY(_mtx1, _mtx2) \ + { \ + using ::gko::test::assertions::detail::l; \ + using ::gko::test::assertions::detail::plain_ptr; \ + EXPECT_PRED_FORMAT2(::gko::test::assertions::matrices_equal_sparsity, \ + plain_ptr(_mtx1), plain_ptr(_mtx2)); \ + } + /** * Checks if two `gko::Array`s are equal. 
@@ -458,11 +830,10 @@ T plain_ptr(T ptr) * @param _array1 first array * @param _array2 second array **/ -#define GKO_ASSERT_ARRAY_EQ(_array1, _array2) \ - { \ - using ::gko::test::assertions::detail::plain_ptr; \ - EXPECT_PRED_FORMAT2(::gko::test::assertions::array_equal, \ - plain_ptr(_array1), plain_ptr(_array2)); \ +#define GKO_ASSERT_ARRAY_EQ(_array1, _array2) \ + { \ + EXPECT_PRED_FORMAT2(::gko::test::assertions::array_equal, _array1, \ + _array2); \ } diff --git a/core/test/utils/assertions_test.cpp b/core/test/utils/assertions_test.cpp index 444b9932c1f..89d15ca585a 100644 --- a/core/test/utils/assertions_test.cpp +++ b/core/test/utils/assertions_test.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,12 +30,16 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ -#include +#include "core/test/utils/assertions.hpp" + + +#include #include +#include #include @@ -45,20 +49,52 @@ namespace { class MatricesNear : public ::testing::Test { protected: using Mtx = gko::matrix::Dense<>; + using Sparse = gko::matrix::Csr<>; + + template + gko::Array make_view(std::array &array) + { + return gko::Array::view(exec, size, array.data()); + } + MatricesNear() : exec(gko::ReferenceExecutor::create()), - mtx1(gko::initialize>( - {{1.0, 2.0, 3.0}, {0.0, 4.0, 0.0}}, exec)), - mtx2(gko::initialize>( - {{1.0, 2.0, 3.0}, {4.0, 0.0, 4.0}}, exec)), - mtx3(gko::initialize>( - {{1.0, 2.0, 3.0}, {0.0, 4.1, 0.0}}, exec)) - {} + mtx1(gko::initialize({{1.0, 2.0, 3.0}, {0.0, 4.0, 0.0}}, exec)), + mtx2(gko::initialize({{1.0, 2.0, 3.0}, {4.0, 0.0, 4.0}}, exec)), + mtx3(gko::initialize({{1.0, 2.0, 3.0}, {0.0, 4.1, 0.0}}, exec)), + mtx13_row_ptrs({0, 3, 4}), + mtx2_row_ptrs({0, 3, 5}), + mtx13_col_idxs({0, 1, 2, 1}), + mtx2_col_idxs({0, 1, 2, 0, 2}), + mtx1_vals({1.0, 2.0, 3.0, 4.0}), + mtx2_vals({1.0, 2.0, 3.0, 4.0, 4.0}), + mtx3_vals({1.0, 2.0, 3.0, 4.1}) + { + mtx1_sp = Sparse::create(exec, mtx1->get_size(), make_view(mtx1_vals), + make_view(mtx13_col_idxs), + make_view(mtx13_row_ptrs)); + mtx2_sp = + Sparse::create(exec, mtx2->get_size(), make_view(mtx2_vals), + make_view(mtx2_col_idxs), make_view(mtx2_row_ptrs)); + mtx3_sp = Sparse::create(exec, mtx3->get_size(), make_view(mtx3_vals), + make_view(mtx13_col_idxs), + make_view(mtx13_row_ptrs)); + } std::shared_ptr exec; std::unique_ptr mtx1; std::unique_ptr mtx2; std::unique_ptr mtx3; + std::array mtx13_row_ptrs; + std::array mtx2_row_ptrs; + std::array mtx13_col_idxs; + std::array mtx2_col_idxs; + std::array mtx1_vals; + std::array mtx2_vals; + std::array mtx3_vals; + std::unique_ptr mtx1_sp; + std::unique_ptr mtx2_sp; + std::unique_ptr mtx3_sp; }; @@ -66,6 +102,8 @@ TEST_F(MatricesNear, SuceedsIfSame) { 
ASSERT_PRED_FORMAT3(gko::test::assertions::matrices_near, mtx1.get(), mtx1.get(), 0.0); + ASSERT_PRED_FORMAT2(gko::test::assertions::matrices_equal_sparsity, + mtx1_sp.get(), mtx1_sp.get()); } @@ -73,6 +111,8 @@ TEST_F(MatricesNear, FailsIfDifferent) { ASSERT_PRED_FORMAT3(!gko::test::assertions::matrices_near, mtx1.get(), mtx2.get(), 0.0); + ASSERT_PRED_FORMAT2(!gko::test::assertions::matrices_equal_sparsity, + mtx1_sp.get(), mtx2_sp.get()); } @@ -82,6 +122,8 @@ TEST_F(MatricesNear, SucceedsIfClose) mtx3.get(), 0.0); ASSERT_PRED_FORMAT3(gko::test::assertions::matrices_near, mtx1.get(), mtx3.get(), 0.1); + ASSERT_PRED_FORMAT2(gko::test::assertions::matrices_equal_sparsity, + mtx1_sp.get(), mtx3_sp.get()); } @@ -89,6 +131,8 @@ TEST_F(MatricesNear, CanUseShortNotation) { GKO_EXPECT_MTX_NEAR(mtx1, mtx1, 0.0); GKO_ASSERT_MTX_NEAR(mtx1, mtx3, 0.1); + GKO_EXPECT_MTX_EQ_SPARSITY(mtx1_sp, mtx3_sp); + GKO_ASSERT_MTX_EQ_SPARSITY(mtx1_sp, mtx3_sp); } @@ -99,4 +143,64 @@ TEST_F(MatricesNear, CanPassInitializerList) } +TEST(BiggestValueType, SameNonComplex) +{ + using T1 = float; + using T2 = float; + using result = + gko::test::assertions::detail::biggest_valuetype::type; + + bool is_float = std::is_same::value; + ASSERT_TRUE(is_float); +} + + +TEST(BiggestValueType, BetweenNonComplex) +{ + using T1 = float; + using T2 = double; + using result = + gko::test::assertions::detail::biggest_valuetype::type; + + bool is_double = std::is_same::value; + ASSERT_TRUE(is_double); +} + + +TEST(BiggestValueType, WithSameComplex) +{ + using T1 = std::complex; + using T2 = std::complex; + using result = + gko::test::assertions::detail::biggest_valuetype::type; + + bool is_cpx_float = std::is_same>::value; + ASSERT_TRUE(is_cpx_float); +} + + +TEST(BiggestValueType, WithAComplex) +{ + using T1 = std::complex; + using T2 = double; + using result = + gko::test::assertions::detail::biggest_valuetype::type; + + bool is_cpx_double = std::is_same>::value; + ASSERT_TRUE(is_cpx_double); +} + + 
+TEST(BiggestValueType, WithBothComplex) +{ + using T1 = std::complex; + using T2 = std::complex; + using result = + gko::test::assertions::detail::biggest_valuetype::type; + + bool is_cpx_double = std::is_same>::value; + ASSERT_TRUE(is_cpx_double); +} + + } // namespace diff --git a/core/test/utils/matrix_generator.hpp b/core/test/utils/matrix_generator.hpp index fa994bf3f4e..171e4b2dd69 100644 --- a/core/test/utils/matrix_generator.hpp +++ b/core/test/utils/matrix_generator.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -186,7 +186,7 @@ std::unique_ptr generate_random_sparsity_matrix( /** - * Generates a random lower triangular matrix. + * Generates a random triangular matrix. * * @tparam MatrixType type of matrix to generate (matrix::Dense must implement * the interface `ConvertibleTo`) @@ -197,6 +197,10 @@ std::unique_ptr generate_random_sparsity_matrix( * * @param num_rows number of rows * @param num_cols number of columns + * @param ones_on_diagonal `true` generates only ones on the diagonal, + * `false` generates random values on the diagonal + * @param lower_triangular `true` generates a lower triangular matrix, + * `false` an upper triangular matrix * @param nonzero_dist distribution of nonzeros per row * @param value_dist distribution of matrix values * @param engine a random engine @@ -205,11 +209,11 @@ std::unique_ptr generate_random_sparsity_matrix( */ template , typename NonzeroDistribution, typename ValueDistribution, typename Engine, typename... 
MatrixArgs> -std::unique_ptr generate_random_lower_triangular_matrix( +std::unique_ptr generate_random_triangular_matrix( size_type num_rows, size_type num_cols, bool ones_on_diagonal, - NonzeroDistribution &&nonzero_dist, ValueDistribution &&value_dist, - Engine &&engine, std::shared_ptr exec, - MatrixArgs &&... args) + bool lower_triangular, NonzeroDistribution &&nonzero_dist, + ValueDistribution &&value_dist, Engine &&engine, + std::shared_ptr exec, MatrixArgs &&... args) { using value_type = typename MatrixType::value_type; using index_type = typename MatrixType::index_type; @@ -229,19 +233,34 @@ std::unique_ptr generate_random_lower_triangular_matrix( // select a subset of `nnz_in_row` column indexes, and fill these // locations with random values std::shuffle(begin(col_idx), end(col_idx), engine); - std::for_each(begin(col_idx), begin(col_idx) + nnz_in_row, - [&](size_type col) { - if (col <= row) { - if (ones_on_diagonal && col == row) { - data.nonzeros.emplace_back(row, col, one); - } else { - data.nonzeros.emplace_back( - row, col, - detail::get_rand_value( - value_dist, engine)); - } - } - }); + // add non-zeros + bool has_diagonal{}; + for (size_type nz = 0; nz < nnz_in_row; ++nz) { + auto col = col_idx[nz]; + // skip non-zeros outside triangle + if ((col > row && lower_triangular) || + (col < row && !lower_triangular)) { + continue; + } + + // generate and store non-zero + auto val = detail::get_rand_value(value_dist, engine); + if (col == row) { + has_diagonal = true; + if (ones_on_diagonal) { + val = one; + } + } + data.nonzeros.emplace_back(row, col, val); + } + + // add diagonal if it hasn't been added yet + if (!has_diagonal) { + auto val = ones_on_diagonal ? one + : detail::get_rand_value( + value_dist, engine); + data.nonzeros.emplace_back(row, row, val); + } } data.ensure_row_major_order(); @@ -254,7 +273,7 @@ std::unique_ptr generate_random_lower_triangular_matrix( /** - * Generates a random upper triangular matrix. 
+ * Generates a random lower triangular matrix. * * @tparam MatrixType type of matrix to generate (matrix::Dense must implement * the interface `ConvertibleTo`) @@ -265,6 +284,8 @@ std::unique_ptr generate_random_lower_triangular_matrix( * * @param num_rows number of rows * @param num_cols number of columns + * @param ones_on_diagonal `true` generates only ones on the diagonal, + * `false` generates random values on the diagonal * @param nonzero_dist distribution of nonzeros per row * @param value_dist distribution of matrix values * @param engine a random engine @@ -273,51 +294,49 @@ std::unique_ptr generate_random_lower_triangular_matrix( */ template , typename NonzeroDistribution, typename ValueDistribution, typename Engine, typename... MatrixArgs> -std::unique_ptr generate_random_upper_triangular_matrix( +std::unique_ptr generate_random_lower_triangular_matrix( size_type num_rows, size_type num_cols, bool ones_on_diagonal, NonzeroDistribution &&nonzero_dist, ValueDistribution &&value_dist, Engine &&engine, std::shared_ptr exec, MatrixArgs &&... 
args) { - using value_type = typename MatrixType::value_type; - using index_type = typename MatrixType::index_type; - using std::begin; - using std::end; - - matrix_data data{gko::dim<2>{num_rows, num_cols}, - {}}; - value_type one = 1.0; - std::vector col_idx(num_cols); - std::iota(begin(col_idx), end(col_idx), size_type(0)); - - for (size_type row = 0; row < num_rows; ++row) { - // randomly generate number of nonzeros in this row - auto nnz_in_row = static_cast(nonzero_dist(engine)); - nnz_in_row = std::max(size_type(0), std::min(nnz_in_row, num_cols)); - // select a subset of `nnz_in_row` column indexes, and fill these - // locations with random values - std::shuffle(begin(col_idx), end(col_idx), engine); - std::for_each(begin(col_idx), begin(col_idx) + nnz_in_row, - [&](size_type col) { - if (col >= row) { - if (ones_on_diagonal && col == row) { - data.nonzeros.emplace_back(row, col, one); - } else { - data.nonzeros.emplace_back( - row, col, - detail::get_rand_value( - value_dist, engine)); - } - } - }); - } + return generate_random_triangular_matrix( + num_rows, num_cols, ones_on_diagonal, true, nonzero_dist, value_dist, + engine, std::move(exec), std::forward(args)...); +} - data.ensure_row_major_order(); - // convert to the correct matrix type - auto result = MatrixType::create(exec, std::forward(args)...); - result->read(data); - return result; +/** + * Generates a random upper triangular matrix. + * + * @tparam MatrixType type of matrix to generate (matrix::Dense must implement + * the interface `ConvertibleTo`) + * @tparam NonzeroDistribution type of nonzero distribution + * @tparam ValueDistribution type of value distribution + * @tparam Engine type of random engine + * @tparam MatrixArgs the arguments from the matrix to be forwarded. 
+ * + * @param num_rows number of rows + * @param num_cols number of columns + * @param ones_on_diagonal `true` generates only ones on the diagonal, + * `false` generates random values on the diagonal + * @param nonzero_dist distribution of nonzeros per row + * @param value_dist distribution of matrix values + * @param engine a random engine + * @param exec executor where the matrix should be allocated + * @param args additional arguments for the matrix constructor + */ +template , typename NonzeroDistribution, + typename ValueDistribution, typename Engine, typename... MatrixArgs> +std::unique_ptr generate_random_upper_triangular_matrix( + size_type num_rows, size_type num_cols, bool ones_on_diagonal, + NonzeroDistribution &&nonzero_dist, ValueDistribution &&value_dist, + Engine &&engine, std::shared_ptr exec, + MatrixArgs &&... args) +{ + return generate_random_triangular_matrix( + num_rows, num_cols, ones_on_diagonal, false, nonzero_dist, value_dist, + engine, std::move(exec), std::forward(args)...); } diff --git a/core/test/utils/matrix_generator_test.cpp b/core/test/utils/matrix_generator_test.cpp index 21710886bac..8a585994dc0 100644 --- a/core/test/utils/matrix_generator_test.cpp +++ b/core/test/utils/matrix_generator_test.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,16 +30,16 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ -#include - - -#include +#include "core/test/utils/matrix_generator.hpp" #include #include +#include + + namespace { diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index af9ba8efddd..381e454fcf8 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -9,9 +9,11 @@ endif() if(MSVC) # MSVC can not find CUDA automatically # Use CUDA_COMPILER PATH to define the CUDA TOOLKIT ROOT DIR + string(REPLACE "/bin/nvcc.exe" "" CMAKE_CUDA_ROOT_DIR ${CMAKE_CUDA_COMPILER}) if("${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}" STREQUAL "") - string(REPLACE "/bin/nvcc.exe" "" CMAKE_CUDA_ROOT_DIR ${CMAKE_CUDA_COMPILER}) set(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES "${CMAKE_CUDA_ROOT_DIR}/include") + endif() + if("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" STREQUAL "") set(CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES "${CMAKE_CUDA_ROOT_DIR}/lib/x64") endif() @@ -59,36 +61,50 @@ find_library(CUSPARSE cusparse add_library(ginkgo_cuda $ "") target_sources(ginkgo_cuda PRIVATE - base/exception.cpp - base/executor.cpp - base/version.cpp - components/zero_array.cu - factorization/par_ilu_kernels.cu - matrix/coo_kernels.cu - matrix/csr_kernels.cu - matrix/dense_kernels.cu - matrix/ell_kernels.cu - matrix/hybrid_kernels.cu - matrix/sellp_kernels.cu - matrix/sparsity_csr_kernels.cu - preconditioner/jacobi_advanced_apply_kernel.cu - preconditioner/jacobi_generate_kernel.cu - preconditioner/jacobi_kernels.cu - preconditioner/jacobi_simple_apply_kernel.cu - solver/bicgstab_kernels.cu - solver/cg_kernels.cu - solver/cgs_kernels.cu - solver/fcg_kernels.cu - solver/gmres_kernels.cu - solver/ir_kernels.cu - solver/lower_trs_kernels.cu - solver/upper_trs_kernels.cu - stop/criterion_kernels.cu - stop/residual_norm_reduction_kernels.cu) + base/exception.cpp + base/executor.cpp + base/version.cpp + components/fill_array.cu + components/precision_conversion.cu + components/prefix_sum.cu + factorization/ilu_kernels.cu + 
factorization/factorization_kernels.cu + factorization/par_ict_kernels.cu + factorization/par_ilu_kernels.cu + factorization/par_ilut_approx_filter_kernel.cu + factorization/par_ilut_filter_kernel.cu + factorization/par_ilut_select_kernel.cu + factorization/par_ilut_select_common.cu + factorization/par_ilut_spgeam_kernel.cu + factorization/par_ilut_sweep_kernel.cu + matrix/coo_kernels.cu + matrix/csr_kernels.cu + matrix/dense_kernels.cu + matrix/ell_kernels.cu + matrix/hybrid_kernels.cu + matrix/sellp_kernels.cu + matrix/sparsity_csr_kernels.cu + preconditioner/isai_kernels.cu + preconditioner/jacobi_advanced_apply_kernel.cu + preconditioner/jacobi_generate_kernel.cu + preconditioner/jacobi_kernels.cu + preconditioner/jacobi_simple_apply_kernel.cu + solver/bicg_kernels.cu + solver/bicgstab_kernels.cu + solver/cg_kernels.cu + solver/cgs_kernels.cu + solver/fcg_kernels.cu + solver/gmres_kernels.cu + solver/ir_kernels.cu + solver/lower_trs_kernels.cu + solver/upper_trs_kernels.cu + stop/criterion_kernels.cu + stop/residual_norm_kernels.cu) # This creates a compilation bug on nvcc 9.0.102 *with* the new array_deleter -# merged at commit ed12b3df5d26 -if(NOT CMAKE_CUDA_COMPILER_VERSION MATCHES "9.0") +# merged at commit ed12b3df5d26, and the parameter is not recognized by clang-cuda +if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND + NOT CMAKE_CUDA_COMPILER_VERSION MATCHES "9.0") # remove false positive CUDA warnings when calling one() and zero() target_compile_options(ginkgo_cuda PRIVATE @@ -107,6 +123,9 @@ target_include_directories(ginkgo_cuda SYSTEM PRIVATE ${CUDA_INCLUDE_DIRS}) target_link_libraries(ginkgo_cuda PRIVATE ${CUDA_RUNTIME_LIBS} ${CUBLAS} ${CUSPARSE}) +# Need to link against ginkgo_hip for the `raw_copy_to(HipExecutor ...)` method +target_link_libraries(ginkgo_cuda PUBLIC ginkgo_hip) + cas_target_cuda_architectures(ginkgo_cuda ARCHITECTURES ${GINKGO_CUDA_ARCHITECTURES} UNSUPPORTED "20" "21") @@ -114,6 +133,10 @@ cas_target_cuda_architectures(ginkgo_cuda 
ginkgo_default_includes(ginkgo_cuda) ginkgo_install_library(ginkgo_cuda cuda) +if (GINKGO_CHECK_CIRCULAR_DEPS) + ginkgo_check_headers(ginkgo_cuda) +endif() + if(GINKGO_BUILD_TESTS) add_subdirectory(test) endif() diff --git a/cuda/base/config.hpp b/cuda/base/config.hpp new file mode 100644 index 00000000000..cd69b6a2c56 --- /dev/null +++ b/cuda/base/config.hpp @@ -0,0 +1,82 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_CUDA_BASE_CONFIG_HPP_ +#define GKO_CUDA_BASE_CONFIG_HPP_ + + +#include + + +#include "cuda/base/math.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { + + +struct config { + /** + * The type containing a bitmask over all lanes of a warp. + */ + using lane_mask_type = uint32; + + /** + * The number of threads within a CUDA warp. + */ + static constexpr uint32 warp_size = 32; + + /** + * The bitmask of the entire warp. + */ + static constexpr auto full_lane_mask = ~zero(); + + /** + * The maximal number of threads allowed in a CUDA warp. + */ + static constexpr uint32 max_block_size = 1024; + + /** + * The minimal amount of warps that need to be scheduled for each block + * to maximize GPU occupancy. + */ + static constexpr uint32 min_warps_per_block = 4; +}; + + +} // namespace cuda +} // namespace kernels +} // namespace gko + + +#endif // GKO_CUDA_BASE_CONFIG_HPP_ diff --git a/cuda/base/cublas_bindings.hpp b/cuda/base/cublas_bindings.hpp index 9dd4d67fa07..72a67d958e9 100644 --- a/cuda/base/cublas_bindings.hpp +++ b/cuda/base/cublas_bindings.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -42,7 +42,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "cuda/base/math.hpp" #include "cuda/base/types.hpp" -#include "cuda/components/zero_array.hpp" namespace gko { @@ -215,24 +214,9 @@ GKO_BIND_CUBLAS_DOT(ValueType, detail::not_implemented); #undef GKO_BIND_CUBLAS_DOT -#define GKO_BIND_CUBLAS_COMPLEX_NORM2(ValueType, CublasName) \ - inline void norm2(cublasHandle_t handle, int n, const ValueType *x, \ - int incx, ValueType *result) \ - { \ - zero_array(n, result); \ - GKO_ASSERT_NO_CUBLAS_ERRORS( \ - CublasName(handle, n, as_culibs_type(x), incx, \ - reinterpret_cast *>( \ - as_culibs_type(result)))); \ - } \ - static_assert(true, \ - "This assert is used to counter the false positive extra " \ - "semi-colon warnings") - - #define GKO_BIND_CUBLAS_NORM2(ValueType, CublasName) \ inline void norm2(cublasHandle_t handle, int n, const ValueType *x, \ - int incx, ValueType *result) \ + int incx, remove_complex *result) \ { \ GKO_ASSERT_NO_CUBLAS_ERRORS(CublasName(handle, n, as_culibs_type(x), \ incx, as_culibs_type(result))); \ @@ -244,8 +228,8 @@ GKO_BIND_CUBLAS_DOT(ValueType, detail::not_implemented); GKO_BIND_CUBLAS_NORM2(float, cublasSnrm2); GKO_BIND_CUBLAS_NORM2(double, cublasDnrm2); -GKO_BIND_CUBLAS_COMPLEX_NORM2(std::complex, cublasScnrm2); -GKO_BIND_CUBLAS_COMPLEX_NORM2(std::complex, cublasDznrm2); +GKO_BIND_CUBLAS_NORM2(std::complex, cublasScnrm2); +GKO_BIND_CUBLAS_NORM2(std::complex, cublasDznrm2); template GKO_BIND_CUBLAS_NORM2(ValueType, detail::not_implemented); diff --git a/cuda/base/cusparse_bindings.hpp b/cuda/base/cusparse_bindings.hpp index e9da6b9952b..ed9f043f9ef 100644 --- a/cuda/base/cusparse_bindings.hpp +++ b/cuda/base/cusparse_bindings.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -45,87 +45,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
namespace gko { -namespace solver { - - -#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020)) - - -struct SolveStruct { - int algorithm; - csrsm2Info_t solve_info; - cusparseSolvePolicy_t policy; - cusparseMatDescr_t factor_descr; - size_t factor_work_size; - void *factor_work_vec; - SolveStruct() - { - factor_work_vec = nullptr; - GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateMatDescr(&factor_descr)); - GKO_ASSERT_NO_CUSPARSE_ERRORS( - cusparseSetMatIndexBase(factor_descr, CUSPARSE_INDEX_BASE_ZERO)); - GKO_ASSERT_NO_CUSPARSE_ERRORS( - cusparseSetMatType(factor_descr, CUSPARSE_MATRIX_TYPE_GENERAL)); - GKO_ASSERT_NO_CUSPARSE_ERRORS( - cusparseSetMatDiagType(factor_descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); - GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateCsrsm2Info(&solve_info)); - algorithm = 0; - policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; - } - SolveStruct(const SolveStruct &) : SolveStruct() {} - SolveStruct(SolveStruct &&) : SolveStruct() {} - SolveStruct &operator=(const SolveStruct &) { return *this; } - SolveStruct &operator=(SolveStruct &&) { return *this; } - ~SolveStruct() - { - cusparseDestroyMatDescr(factor_descr); - if (solve_info) { - cusparseDestroyCsrsm2Info(solve_info); - } - if (factor_work_vec != nullptr) { - cudaFree(factor_work_vec); - factor_work_vec = nullptr; - } - } -}; - - -#elif (defined(CUDA_VERSION) && (CUDA_VERSION < 9020)) - - -struct SolveStruct { - cusparseSolveAnalysisInfo_t solve_info; - cusparseMatDescr_t factor_descr; - SolveStruct() - { - GKO_ASSERT_NO_CUSPARSE_ERRORS( - cusparseCreateSolveAnalysisInfo(&solve_info)); - GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateMatDescr(&factor_descr)); - GKO_ASSERT_NO_CUSPARSE_ERRORS( - cusparseSetMatIndexBase(factor_descr, CUSPARSE_INDEX_BASE_ZERO)); - GKO_ASSERT_NO_CUSPARSE_ERRORS( - cusparseSetMatType(factor_descr, CUSPARSE_MATRIX_TYPE_GENERAL)); - GKO_ASSERT_NO_CUSPARSE_ERRORS( - cusparseSetMatDiagType(factor_descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); - } - SolveStruct(const SolveStruct &) : 
SolveStruct() {} - SolveStruct(SolveStruct &&) : SolveStruct() {} - SolveStruct &operator=(const SolveStruct &) { return *this; } - SolveStruct &operator=(SolveStruct &&) { return *this; } - ~SolveStruct() - { - cusparseDestroyMatDescr(factor_descr); - cusparseDestroySolveAnalysisInfo(solve_info); - } -}; - - -#endif - - -} // namespace solver - - namespace kernels { namespace cuda { /** @@ -417,6 +336,129 @@ GKO_BIND_CUSPARSE32_SPMV(ValueType, detail::not_implemented); #undef GKO_BIND_CUSPARSE32_SPMV +template +void spgemm_buffer_size( + cusparseHandle_t handle, IndexType m, IndexType n, IndexType k, + const ValueType *alpha, const cusparseMatDescr_t descrA, IndexType nnzA, + const IndexType *csrRowPtrA, const IndexType *csrColIndA, + const cusparseMatDescr_t descrB, IndexType nnzB, + const IndexType *csrRowPtrB, const IndexType *csrColIndB, + const ValueType *beta, const cusparseMatDescr_t descrD, IndexType nnzD, + const IndexType *csrRowPtrD, const IndexType *csrColIndD, + csrgemm2Info_t info, size_type &result) GKO_NOT_IMPLEMENTED; + +#define GKO_BIND_CUSPARSE_SPGEMM_BUFFER_SIZE(ValueType, CusparseName) \ + template <> \ + inline void spgemm_buffer_size( \ + cusparseHandle_t handle, int32 m, int32 n, int32 k, \ + const ValueType *alpha, const cusparseMatDescr_t descrA, int32 nnzA, \ + const int32 *csrRowPtrA, const int32 *csrColIndA, \ + const cusparseMatDescr_t descrB, int32 nnzB, const int32 *csrRowPtrB, \ + const int32 *csrColIndB, const ValueType *beta, \ + const cusparseMatDescr_t descrD, int32 nnzD, const int32 *csrRowPtrD, \ + const int32 *csrColIndD, csrgemm2Info_t info, size_type &result) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS( \ + CusparseName(handle, m, n, k, as_culibs_type(alpha), descrA, nnzA, \ + csrRowPtrA, csrColIndA, descrB, nnzB, csrRowPtrB, \ + csrColIndB, as_culibs_type(beta), descrD, nnzD, \ + csrRowPtrD, csrColIndD, info, &result)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + 
"semi-colon warnings") + +GKO_BIND_CUSPARSE_SPGEMM_BUFFER_SIZE(float, cusparseScsrgemm2_bufferSizeExt); +GKO_BIND_CUSPARSE_SPGEMM_BUFFER_SIZE(double, cusparseDcsrgemm2_bufferSizeExt); +GKO_BIND_CUSPARSE_SPGEMM_BUFFER_SIZE(std::complex, + cusparseCcsrgemm2_bufferSizeExt); +GKO_BIND_CUSPARSE_SPGEMM_BUFFER_SIZE(std::complex, + cusparseZcsrgemm2_bufferSizeExt); + + +#undef GKO_BIND_CUSPARSE_SPGEMM_BUFFER_SIZE + + +template +void spgemm_nnz(cusparseHandle_t handle, IndexType m, IndexType n, IndexType k, + const cusparseMatDescr_t descrA, IndexType nnzA, + const IndexType *csrRowPtrA, const IndexType *csrColIndA, + const cusparseMatDescr_t descrB, IndexType nnzB, + const IndexType *csrRowPtrB, const IndexType *csrColIndB, + const cusparseMatDescr_t descrD, IndexType nnzD, + const IndexType *csrRowPtrD, const IndexType *csrColIndD, + const cusparseMatDescr_t descrC, IndexType *csrRowPtrC, + IndexType *nnzC, csrgemm2Info_t info, + void *buffer) GKO_NOT_IMPLEMENTED; + +template <> +inline void spgemm_nnz( + cusparseHandle_t handle, int32 m, int32 n, int32 k, + const cusparseMatDescr_t descrA, int32 nnzA, const int32 *csrRowPtrA, + const int32 *csrColIndA, const cusparseMatDescr_t descrB, int32 nnzB, + const int32 *csrRowPtrB, const int32 *csrColIndB, + const cusparseMatDescr_t descrD, int32 nnzD, const int32 *csrRowPtrD, + const int32 *csrColIndD, const cusparseMatDescr_t descrC, int32 *csrRowPtrC, + int32 *nnzC, csrgemm2Info_t info, void *buffer) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseXcsrgemm2Nnz( + handle, m, n, k, descrA, nnzA, csrRowPtrA, csrColIndA, descrB, nnzB, + csrRowPtrB, csrColIndB, descrD, nnzD, csrRowPtrD, csrColIndD, descrC, + csrRowPtrC, nnzC, info, buffer)); +} + + +template +void spgemm(cusparseHandle_t handle, IndexType m, IndexType n, IndexType k, + const ValueType *alpha, const cusparseMatDescr_t descrA, + IndexType nnzA, const ValueType *csrValA, + const IndexType *csrRowPtrA, const IndexType *csrColIndA, + const cusparseMatDescr_t descrB, 
IndexType nnzB, + const ValueType *csrValB, const IndexType *csrRowPtrB, + const IndexType *csrColIndB, const ValueType *beta, + const cusparseMatDescr_t descrD, IndexType nnzD, + const ValueType *csrValD, const IndexType *csrRowPtrD, + const IndexType *csrColIndD, const cusparseMatDescr_t descrC, + ValueType *csrValC, const IndexType *csrRowPtrC, + IndexType *csrColIndC, csrgemm2Info_t info, + void *buffer) GKO_NOT_IMPLEMENTED; + +#define GKO_BIND_CUSPARSE_SPGEMM(ValueType, CusparseName) \ + template <> \ + inline void spgemm( \ + cusparseHandle_t handle, int32 m, int32 n, int32 k, \ + const ValueType *alpha, const cusparseMatDescr_t descrA, int32 nnzA, \ + const ValueType *csrValA, const int32 *csrRowPtrA, \ + const int32 *csrColIndA, const cusparseMatDescr_t descrB, int32 nnzB, \ + const ValueType *csrValB, const int32 *csrRowPtrB, \ + const int32 *csrColIndB, const ValueType *beta, \ + const cusparseMatDescr_t descrD, int32 nnzD, const ValueType *csrValD, \ + const int32 *csrRowPtrD, const int32 *csrColIndD, \ + const cusparseMatDescr_t descrC, ValueType *csrValC, \ + const int32 *csrRowPtrC, int32 *csrColIndC, csrgemm2Info_t info, \ + void *buffer) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS(CusparseName( \ + handle, m, n, k, as_culibs_type(alpha), descrA, nnzA, \ + as_culibs_type(csrValA), csrRowPtrA, csrColIndA, descrB, nnzB, \ + as_culibs_type(csrValB), csrRowPtrB, csrColIndB, \ + as_culibs_type(beta), descrD, nnzD, as_culibs_type(csrValD), \ + csrRowPtrD, csrColIndD, descrC, as_culibs_type(csrValC), \ + csrRowPtrC, csrColIndC, info, buffer)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_CUSPARSE_SPGEMM(float, cusparseScsrgemm2); +GKO_BIND_CUSPARSE_SPGEMM(double, cusparseDcsrgemm2); +GKO_BIND_CUSPARSE_SPGEMM(std::complex, cusparseCcsrgemm2); +GKO_BIND_CUSPARSE_SPGEMM(std::complex, cusparseZcsrgemm2); + + +#undef GKO_BIND_CUSPARSE_SPGEMM + + #define 
GKO_BIND_CUSPARSE32_CSR2HYB(ValueType, CusparseName) \ inline void csr2hyb(cusparseHandle_t handle, int32 m, int32 n, \ const cusparseMatDescr_t descrA, \ @@ -573,6 +615,73 @@ inline void destroy(cusparseMatDescr_t descr) } +inline csrgemm2Info_t create_spgemm_info() +{ + csrgemm2Info_t info{}; + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateCsrgemm2Info(&info)); + return info; +} + + +inline void destroy(csrgemm2Info_t info) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyCsrgemm2Info(info)); +} + + +// CUDA versions 9.2 and above have csrsm2. +#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020)) + + +inline csrsm2Info_t create_solve_info() +{ + csrsm2Info_t info{}; + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateCsrsm2Info(&info)); + return info; +} + + +inline void destroy(csrsm2Info_t info) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyCsrsm2Info(info)); +} + + +// CUDA_VERSION<=9.1 do not support csrsm2. +#elif (defined(CUDA_VERSION) && (CUDA_VERSION < 9020)) + + +inline cusparseSolveAnalysisInfo_t create_solve_info() +{ + cusparseSolveAnalysisInfo_t info{}; + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateSolveAnalysisInfo(&info)); + return info; +} + + +inline void destroy(cusparseSolveAnalysisInfo_t info) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySolveAnalysisInfo(info)); +} + + +#endif + + +inline csrilu02Info_t create_ilu0_info() +{ + csrilu02Info_t info{}; + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateCsrilu02Info(&info)); + return info; +} + + +inline void destroy(csrilu02Info_t info) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyCsrilu02Info(info)); +} + + // CUDA versions 9.2 and above have csrsm2. 
#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020)) @@ -788,7 +897,8 @@ GKO_BIND_CUSPARSE64_CSRSM_ANALYSIS(ValueType, detail::not_implemented); size_type n, const ValueType *one, const cusparseMatDescr_t descr, \ const ValueType *csrVal, const int32 *csrRowPtr, \ const int32 *csrColInd, cusparseSolveAnalysisInfo_t factor_info, \ - ValueType *rhs, int32 rhs_stride, ValueType *sol, int32 sol_stride) \ + const ValueType *rhs, int32 rhs_stride, ValueType *sol, \ + int32 sol_stride) \ { \ GKO_ASSERT_NO_CUSPARSE_ERRORS( \ CusparseName(handle, trans, m, n, as_culibs_type(one), descr, \ @@ -806,8 +916,8 @@ GKO_BIND_CUSPARSE64_CSRSM_ANALYSIS(ValueType, detail::not_implemented); size_type n, const ValueType *one, const cusparseMatDescr_t descr, \ const ValueType *csrVal, const int64 *csrRowPtr, \ const int64 *csrColInd, cusparseSolveAnalysisInfo_t factor_info, \ - ValueType *rhs, int64 rhs_stride, ValueType *sol, int64 sol_stride) \ - GKO_NOT_IMPLEMENTED; \ + const ValueType *rhs, int64 rhs_stride, ValueType *sol, \ + int64 sol_stride) GKO_NOT_IMPLEMENTED; \ static_assert(true, \ "This assert is used to counter the false positive extra " \ "semi-colon warnings") @@ -831,6 +941,180 @@ GKO_BIND_CUSPARSE64_CSRSM_SOLVE(ValueType, detail::not_implemented); #endif +template +void create_identity_permutation(cusparseHandle_t handle, IndexType size, + IndexType *permutation) GKO_NOT_IMPLEMENTED; + +template <> +inline void create_identity_permutation(cusparseHandle_t handle, + int32 size, int32 *permutation) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseCreateIdentityPermutation(handle, size, permutation)); +} + + +template +void csrsort_buffer_size(cusparseHandle_t handle, IndexType m, IndexType n, + IndexType nnz, const IndexType *row_ptrs, + const IndexType *col_idxs, + size_type &buffer_size) GKO_NOT_IMPLEMENTED; + +template <> +inline void csrsort_buffer_size(cusparseHandle_t handle, int32 m, + int32 n, int32 nnz, + const int32 *row_ptrs, + const int32 *col_idxs, + size_type 
&buffer_size) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseXcsrsort_bufferSizeExt( + handle, m, n, nnz, row_ptrs, col_idxs, &buffer_size)); +} + + +template +void csrsort(cusparseHandle_t handle, IndexType m, IndexType n, IndexType nnz, + const cusparseMatDescr_t descr, const IndexType *row_ptrs, + IndexType *col_idxs, IndexType *permutation, + void *buffer) GKO_NOT_IMPLEMENTED; + +template <> +inline void csrsort(cusparseHandle_t handle, int32 m, int32 n, int32 nnz, + const cusparseMatDescr_t descr, + const int32 *row_ptrs, int32 *col_idxs, + int32 *permutation, void *buffer) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseXcsrsort( + handle, m, n, nnz, descr, row_ptrs, col_idxs, permutation, buffer)); +} + + +template +void gather(cusparseHandle_t handle, IndexType nnz, const ValueType *in, + ValueType *out, const IndexType *permutation) GKO_NOT_IMPLEMENTED; + +#define GKO_BIND_CUSPARSE_GATHER(ValueType, CusparseName) \ + template <> \ + inline void gather(cusparseHandle_t handle, int32 nnz, \ + const ValueType *in, ValueType *out, \ + const int32 *permutation) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS( \ + CusparseName(handle, nnz, as_culibs_type(in), as_culibs_type(out), \ + permutation, CUSPARSE_INDEX_BASE_ZERO)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_CUSPARSE_GATHER(float, cusparseSgthr); +GKO_BIND_CUSPARSE_GATHER(double, cusparseDgthr); +GKO_BIND_CUSPARSE_GATHER(std::complex, cusparseCgthr); +GKO_BIND_CUSPARSE_GATHER(std::complex, cusparseZgthr); + +#undef GKO_BIND_CUSPARSE_GATHER + + +template +void ilu0_buffer_size(cusparseHandle_t handle, IndexType m, IndexType nnz, + const cusparseMatDescr_t descr, const ValueType *vals, + const IndexType *row_ptrs, const IndexType *col_idxs, + csrilu02Info_t info, + size_type &buffer_size) GKO_NOT_IMPLEMENTED; + +#define GKO_BIND_CUSPARSE_ILU0_BUFFER_SIZE(ValueType, CusparseName) \ + template <> \ + inline void ilu0_buffer_size( \ + 
cusparseHandle_t handle, int32 m, int32 nnz, \ + const cusparseMatDescr_t descr, const ValueType *vals, \ + const int32 *row_ptrs, const int32 *col_idxs, csrilu02Info_t info, \ + size_type &buffer_size) \ + { \ + int tmp_buffer_size{}; \ + GKO_ASSERT_NO_CUSPARSE_ERRORS( \ + CusparseName(handle, m, nnz, descr, \ + as_culibs_type(const_cast(vals)), \ + row_ptrs, col_idxs, info, &tmp_buffer_size)); \ + buffer_size = tmp_buffer_size; \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_CUSPARSE_ILU0_BUFFER_SIZE(float, cusparseScsrilu02_bufferSize); +GKO_BIND_CUSPARSE_ILU0_BUFFER_SIZE(double, cusparseDcsrilu02_bufferSize); +GKO_BIND_CUSPARSE_ILU0_BUFFER_SIZE(std::complex, + cusparseCcsrilu02_bufferSize); +GKO_BIND_CUSPARSE_ILU0_BUFFER_SIZE(std::complex, + cusparseZcsrilu02_bufferSize); + +#undef GKO_BIND_CUSPARSE_ILU0_BUFFER_SIZE + + +template +void ilu0_analysis(cusparseHandle_t handle, IndexType m, IndexType nnz, + const cusparseMatDescr_t descr, const ValueType *vals, + const IndexType *row_ptrs, const IndexType *col_idxs, + csrilu02Info_t info, cusparseSolvePolicy_t policy, + void *buffer) GKO_NOT_IMPLEMENTED; + +#define GKO_BIND_CUSPARSE_ILU0_ANALYSIS(ValueType, CusparseName) \ + template <> \ + inline void ilu0_analysis( \ + cusparseHandle_t handle, int32 m, int32 nnz, \ + const cusparseMatDescr_t descr, const ValueType *vals, \ + const int32 *row_ptrs, const int32 *col_idxs, csrilu02Info_t info, \ + cusparseSolvePolicy_t policy, void *buffer) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS( \ + CusparseName(handle, m, nnz, descr, as_culibs_type(vals), \ + row_ptrs, col_idxs, info, policy, buffer)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_CUSPARSE_ILU0_ANALYSIS(float, cusparseScsrilu02_analysis); +GKO_BIND_CUSPARSE_ILU0_ANALYSIS(double, cusparseDcsrilu02_analysis); 
+GKO_BIND_CUSPARSE_ILU0_ANALYSIS(std::complex, + cusparseCcsrilu02_analysis); +GKO_BIND_CUSPARSE_ILU0_ANALYSIS(std::complex, + cusparseZcsrilu02_analysis); + +#undef GKO_BIND_CUSPARSE_ILU0_ANALYSIS + + +template +void ilu0(cusparseHandle_t handle, IndexType m, IndexType nnz, + const cusparseMatDescr_t descr, ValueType *vals, + const IndexType *row_ptrs, const IndexType *col_idxs, + csrilu02Info_t info, cusparseSolvePolicy_t policy, + void *buffer) GKO_NOT_IMPLEMENTED; + +#define GKO_BIND_CUSPARSE_ILU0(ValueType, CusparseName) \ + template <> \ + inline void ilu0( \ + cusparseHandle_t handle, int32 m, int32 nnz, \ + const cusparseMatDescr_t descr, ValueType *vals, \ + const int32 *row_ptrs, const int32 *col_idxs, csrilu02Info_t info, \ + cusparseSolvePolicy_t policy, void *buffer) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS( \ + CusparseName(handle, m, nnz, descr, as_culibs_type(vals), \ + row_ptrs, col_idxs, info, policy, buffer)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_CUSPARSE_ILU0(float, cusparseScsrilu02); +GKO_BIND_CUSPARSE_ILU0(double, cusparseDcsrilu02); +GKO_BIND_CUSPARSE_ILU0(std::complex, cusparseCcsrilu02); +GKO_BIND_CUSPARSE_ILU0(std::complex, cusparseZcsrilu02); + +#undef GKO_BIND_CUSPARSE_ILU0 + + } // namespace cusparse } // namespace cuda } // namespace kernels diff --git a/cuda/base/device_guard.hpp b/cuda/base/device_guard.hpp index 7cda48593d0..aa347994327 100644 --- a/cuda/base/device_guard.hpp +++ b/cuda/base/device_guard.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
namespace gko { +namespace cuda { /** @@ -85,6 +86,7 @@ class device_guard { }; +} // namespace cuda } // namespace gko diff --git a/cuda/base/exception.cpp b/cuda/base/exception.cpp index a781867cc27..93fcd5e7cfd 100644 --- a/cuda/base/exception.cpp +++ b/cuda/base/exception.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,11 +33,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include + + #include #include #include +#include + + namespace gko { @@ -67,6 +73,8 @@ std::string CublasError::get_error(int64 error_code) GKO_REGISTER_CUBLAS_ERROR(CUBLAS_STATUS_NOT_SUPPORTED); GKO_REGISTER_CUBLAS_ERROR(CUBLAS_STATUS_LICENSE_ERROR); return "Unknown error"; + +#undef GKO_REGISTER_CUBLAS_ERROR } @@ -86,6 +94,8 @@ std::string CusparseError::get_error(int64 error_code) GKO_REGISTER_CUSPARSE_ERROR(CUSPARSE_STATUS_INTERNAL_ERROR); GKO_REGISTER_CUSPARSE_ERROR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); return "Unknown error"; + +#undef GKO_REGISTER_CUSPARSE_ERROR } diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp index 3faa5e7390f..a72ecef7591 100644 --- a/cuda/base/executor.cpp +++ b/cuda/base/executor.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -43,76 +43,28 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "cuda/base/config.hpp" #include "cuda/base/cublas_bindings.hpp" #include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/device_guard.hpp" namespace gko { -namespace { -// The function is copied from _ConvertSMVer2Cores of -// cuda-9.2/samples/common/inc/helper_cuda.h -inline int convert_sm_ver_to_cores(int major, int minor) -{ - // Defines for GPU Architecture types (using the SM version to determine - // the # of cores per SM - typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, - // and m = SM minor version - int Cores; - } sSMtoCores; - - sSMtoCores nGpuArchCoresPerSM[] = { - {0x30, 192}, // Kepler Generation (SM 3.0) GK10x class - {0x32, 192}, // Kepler Generation (SM 3.2) GK10x class - {0x35, 192}, // Kepler Generation (SM 3.5) GK11x class - {0x37, 192}, // Kepler Generation (SM 3.7) GK21x class - {0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class - {0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class - {0x53, 128}, // Maxwell Generation (SM 5.3) GM20x class - {0x60, 64}, // Pascal Generation (SM 6.0) GP100 class - {0x61, 128}, // Pascal Generation (SM 6.1) GP10x class - {0x62, 128}, // Pascal Generation (SM 6.2) GP10x class - {0x70, 64}, // Volta Generation (SM 7.0) GV100 class - {0x72, 64}, // Volta Generation (SM 7.2) GV11b class - {0x75, 64}, // Turing Generation (SM 7.5) TU1xx class - {-1, -1}}; - - int index = 0; - - while (nGpuArchCoresPerSM[index].SM != -1) { - if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { - return nGpuArchCoresPerSM[index].Cores; - } - index++; - } - -#if GKO_VERBOSE_LEVEL >= 1 - // If we don't find the values, we use the last valid value by default - // to allow proper execution - std::cerr << "MapSMtoCores for SM " << major << "." << minor - << "is undefined. The default value of " - << nGpuArchCoresPerSM[index - 1].Cores << " Cores/SM is used." 
- << std::endl; -#endif - return nGpuArchCoresPerSM[index - 1].Cores; -} - - -} // namespace +#include "common/base/executor.hpp.inc" std::shared_ptr CudaExecutor::create( - int device_id, std::shared_ptr master) + int device_id, std::shared_ptr master, bool device_reset) { return std::shared_ptr( - new CudaExecutor(device_id, std::move(master)), + new CudaExecutor(device_id, std::move(master), device_reset), [device_id](CudaExecutor *exec) { delete exec; - if (!CudaExecutor::get_num_execs(device_id)) { - device_guard g(device_id); + if (!CudaExecutor::get_num_execs(device_id) && + exec->get_device_reset()) { + cuda::device_guard g(device_id); cudaDeviceReset(); } }); @@ -122,15 +74,17 @@ std::shared_ptr CudaExecutor::create( void OmpExecutor::raw_copy_to(const CudaExecutor *dest, size_type num_bytes, const void *src_ptr, void *dest_ptr) const { - device_guard g(dest->get_device_id()); - GKO_ASSERT_NO_CUDA_ERRORS( - cudaMemcpy(dest_ptr, src_ptr, num_bytes, cudaMemcpyHostToDevice)); + if (num_bytes > 0) { + cuda::device_guard g(dest->get_device_id()); + GKO_ASSERT_NO_CUDA_ERRORS( + cudaMemcpy(dest_ptr, src_ptr, num_bytes, cudaMemcpyHostToDevice)); + } } void CudaExecutor::raw_free(void *ptr) const noexcept { - device_guard g(this->get_device_id()); + cuda::device_guard g(this->get_device_id()); auto error_code = cudaFree(ptr); if (error_code != cudaSuccess) { #if GKO_VERBOSE_LEVEL >= 1 @@ -148,7 +102,7 @@ void CudaExecutor::raw_free(void *ptr) const noexcept void *CudaExecutor::raw_alloc(size_type num_bytes) const { void *dev_ptr = nullptr; - device_guard g(this->get_device_id()); + cuda::device_guard g(this->get_device_id()); auto error_code = cudaMalloc(&dev_ptr, num_bytes); if (error_code != cudaErrorMemoryAllocation) { GKO_ASSERT_NO_CUDA_ERRORS(error_code); @@ -161,24 +115,45 @@ void *CudaExecutor::raw_alloc(size_type num_bytes) const void CudaExecutor::raw_copy_to(const OmpExecutor *, size_type num_bytes, const void *src_ptr, void *dest_ptr) const { - 
device_guard g(this->get_device_id()); - GKO_ASSERT_NO_CUDA_ERRORS( - cudaMemcpy(dest_ptr, src_ptr, num_bytes, cudaMemcpyDeviceToHost)); + if (num_bytes > 0) { + cuda::device_guard g(this->get_device_id()); + GKO_ASSERT_NO_CUDA_ERRORS( + cudaMemcpy(dest_ptr, src_ptr, num_bytes, cudaMemcpyDeviceToHost)); + } } void CudaExecutor::raw_copy_to(const CudaExecutor *src, size_type num_bytes, const void *src_ptr, void *dest_ptr) const { - device_guard g(this->get_device_id()); - GKO_ASSERT_NO_CUDA_ERRORS(cudaMemcpyPeer( - dest_ptr, this->device_id_, src_ptr, src->get_device_id(), num_bytes)); + if (num_bytes > 0) { + cuda::device_guard g(this->get_device_id()); + GKO_ASSERT_NO_CUDA_ERRORS(cudaMemcpyPeer(dest_ptr, this->device_id_, + src_ptr, src->get_device_id(), + num_bytes)); + } +} + + +void CudaExecutor::raw_copy_to(const HipExecutor *src, size_type num_bytes, + const void *src_ptr, void *dest_ptr) const +{ +#if GINKGO_HIP_PLATFORM_NVCC == 1 + if (num_bytes > 0) { + cuda::device_guard g(this->get_device_id()); + GKO_ASSERT_NO_CUDA_ERRORS(cudaMemcpyPeer(dest_ptr, this->device_id_, + src_ptr, src->get_device_id(), + num_bytes)); + } +#else + GKO_NOT_SUPPORTED(this); +#endif } void CudaExecutor::synchronize() const { - device_guard g(this->get_device_id()); + cuda::device_guard g(this->get_device_id()); GKO_ASSERT_NO_CUDA_ERRORS(cudaDeviceSynchronize()); } @@ -186,7 +161,7 @@ void CudaExecutor::synchronize() const void CudaExecutor::run(const Operation &op) const { this->template log(this, &op); - device_guard g(this->get_device_id()); + cuda::device_guard g(this->get_device_id()); op.run( std::static_pointer_cast(this->shared_from_this())); this->template log(this, &op); @@ -208,14 +183,16 @@ int CudaExecutor::get_num_devices() void CudaExecutor::set_gpu_property() { if (device_id_ < this->get_num_devices() && device_id_ >= 0) { - device_guard g(this->get_device_id()); + cuda::device_guard g(this->get_device_id()); GKO_ASSERT_NO_CUDA_ERRORS(cudaDeviceGetAttribute( 
&major_, cudaDevAttrComputeCapabilityMajor, device_id_)); GKO_ASSERT_NO_CUDA_ERRORS(cudaDeviceGetAttribute( &minor_, cudaDevAttrComputeCapabilityMinor, device_id_)); GKO_ASSERT_NO_CUDA_ERRORS(cudaDeviceGetAttribute( &num_multiprocessor_, cudaDevAttrMultiProcessorCount, device_id_)); - num_cores_per_sm_ = convert_sm_ver_to_cores(major_, minor_); + num_warps_per_sm_ = convert_sm_ver_to_cores(major_, minor_) / + kernels::cuda::config::warp_size; + warp_size_ = kernels::cuda::config::warp_size; } } @@ -224,15 +201,15 @@ void CudaExecutor::init_handles() { if (device_id_ < this->get_num_devices() && device_id_ >= 0) { const auto id = this->get_device_id(); - device_guard g(id); + cuda::device_guard g(id); this->cublas_handle_ = handle_manager( kernels::cuda::cublas::init(), [id](cublasHandle_t handle) { - device_guard g(id); + cuda::device_guard g(id); kernels::cuda::cublas::destroy(handle); }); this->cusparse_handle_ = handle_manager( kernels::cuda::cusparse::init(), [id](cusparseHandle_t handle) { - device_guard g(id); + cuda::device_guard g(id); kernels::cuda::cusparse::destroy(handle); }); } diff --git a/cuda/base/math.hpp b/cuda/base/math.hpp index bb425214a78..7e970486a1e 100644 --- a/cuda/base/math.hpp +++ b/cuda/base/math.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -41,178 +41,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
namespace gko { -namespace detail { - - -template -struct remove_complex_impl> { - using type = T; -}; - - -template -struct is_complex_impl> - : public std::integral_constant {}; - - -template -struct truncate_type_impl> { - using type = thrust::complex::type>; -}; - - -} // namespace detail - - -template <> -__device__ GKO_INLINE std::complex zero>() -{ - thrust::complex z(0); - return reinterpret_cast &>(z); -} - -template <> -__device__ GKO_INLINE std::complex zero>() -{ - thrust::complex z(0); - return reinterpret_cast &>(z); -} - -template <> -__device__ GKO_INLINE std::complex one>() -{ - thrust::complex z(1); - return reinterpret_cast &>(z); -} - -template <> -__device__ GKO_INLINE std::complex one>() -{ - thrust::complex z(1); - return reinterpret_cast &>(z); -} - - -// This first part is specific for clang and intel in combination with the nvcc -// compiler from the toolkit older than 9.2. -// Both want to use their `__builtin_isfinite` function, which is not present -// as a __device__ function, so it results in a compiler error. -// Here, `isfinite` is written by hand, which might not be as performant as the -// intrinsic function from CUDA, but it compiles and works. -#if defined(__CUDA_ARCH__) && \ - (defined(_MSC_VER) || \ - (defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) && \ - (__CUDACC_VER_MAJOR__ * 1000 + __CUDACC_VER_MINOR__) < 9002 && \ - (defined(__clang__) || defined(__ICC) || defined(__ICL)))) - - -namespace detail { - - -/** - * This structure can be used to get the exponent mask of a given floating - * point type. Uses specialization to implement different types. 
- */ -template -struct mask_creator {}; - -template <> -struct mask_creator { - using int_type = int32; - static constexpr int_type number_exponent_bits = 8; - static constexpr int_type number_significand_bits = 23; - // integer representation of a floating point number, where all exponent - // bits are set - static constexpr int_type exponent_mask = - ((int_type{1} << number_exponent_bits) - 1) << number_significand_bits; - static __device__ int_type reinterpret_int(const float &value) - { - return __float_as_int(value); - } -}; - -template <> -struct mask_creator { - using int_type = int64; - static constexpr int_type number_exponent_bits = 11; - static constexpr int_type number_significand_bits = 52; - // integer representation of a floating point number, where all exponent - // bits are set - static constexpr int_type exponent_mask = - ((int_type{1} << number_exponent_bits) - 1) << number_significand_bits; - static __device__ int_type reinterpret_int(const double &value) - { - return __double_as_longlong(value); - } -}; - - -} // namespace detail - - -/** - * Checks if a given value is finite, meaning it is neither +/- infinity - * nor NaN. - * - * @internal It checks if all exponent bits are set. If all are set, the - * number either represents NaN or +/- infinity, meaning it is a - * non-finite number. - * - * @param value value to check - * - * returns `true` if the given value is finite, meaning it is neither - * +/- infinity nor NaN. 
- */ -#define GKO_DEFINE_ISFINITE_FOR_TYPE(_type) \ - GKO_INLINE __device__ bool isfinite(const _type &value) \ - { \ - constexpr auto mask = detail::mask_creator<_type>::exponent_mask; \ - const auto re_int = \ - detail::mask_creator<_type>::reinterpret_int(value); \ - return (re_int & mask) != mask; \ - } - -GKO_DEFINE_ISFINITE_FOR_TYPE(float) -GKO_DEFINE_ISFINITE_FOR_TYPE(double) -#undef GKO_DEFINE_ISFINITE_FOR_TYPE - - -/** - * Checks if all components of a complex value are finite, meaning they are - * neither +/- infinity nor NaN. - * - * @internal required for the clang compiler. This function will be used rather - * than the `isfinite` function in the public `math.hpp` because - * there is no template parameter, so it is prefered during lookup. - * - * @tparam T complex type of the value to check - * - * @param value complex value to check - * - * returns `true` if both components of the given value are finite, meaning - * they are neither +/- infinity nor NaN. - */ -#define GKO_DEFINE_ISFINITE_FOR_COMPLEX_TYPE(_type) \ - GKO_INLINE __device__ bool isfinite(const _type &value) \ - { \ - return isfinite(value.real()) && isfinite(value.imag()); \ - } - -GKO_DEFINE_ISFINITE_FOR_COMPLEX_TYPE(thrust::complex) -GKO_DEFINE_ISFINITE_FOR_COMPLEX_TYPE(thrust::complex) -#undef GKO_DEFINE_ISFINITE_FOR_COMPLEX_TYPE - - -// For all other compiler in combination with CUDA, just use the provided -// `isfinite` function -#elif defined(__CUDA_ARCH__) - - -// If it is compiled with the CUDA compiler, use their `isfinite` -using ::isfinite; - - -#endif // defined(__CUDA_ARCH__) + + +#include "common/base/math.hpp.inc" } // namespace gko diff --git a/cuda/base/pointer_mode_guard.hpp b/cuda/base/pointer_mode_guard.hpp index f9094d21f8e..89bd724bedf 100644 --- a/cuda/base/pointer_mode_guard.hpp +++ b/cuda/base/pointer_mode_guard.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, 
the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 4cc9e304ce7..3da51bd2ac9 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -35,9 +35,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include +#include #include +#include +#include + + namespace gko { @@ -190,6 +196,33 @@ constexpr cudaDataType_t cuda_data_type_impl() } +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && \ + !(defined(_WIN32) || defined(__CYGWIN__)) + + +template +constexpr cusparseIndexType_t cusparse_index_type_impl() +{ + return CUSPARSE_INDEX_16U; +} + +template <> +constexpr cusparseIndexType_t cusparse_index_type_impl() +{ + return CUSPARSE_INDEX_32I; +} + +template <> +constexpr cusparseIndexType_t cusparse_index_type_impl() +{ + return CUSPARSE_INDEX_64I; +} + + +#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && + // !(defined(_WIN32) || defined(__CYGWIN__)) + + } // namespace detail @@ -208,6 +241,29 @@ constexpr cudaDataType_t cuda_data_type() } +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && \ + !(defined(_WIN32) || defined(__CYGWIN__)) + + +/** + * This is an alias for the `cusparseIndexType_t` equivalent of `T`. By default, + * CUSPARSE_INDEX_16U is returned. + * + * @tparam T a type + * + * @returns the actual `cusparseIndexType_t` + */ +template +constexpr cusparseIndexType_t cusparse_index_type() +{ + return detail::cusparse_index_type_impl(); +} + + +#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && + // !(defined(_WIN32) || defined(__CYGWIN__)) + + /** * This is an alias for CUDA's equivalent of `T`. 
* @@ -270,30 +326,6 @@ inline culibs_type as_culibs_type(T val) } -struct cuda_config { - /** - * The number of threads within a CUDA warp. - */ - static constexpr uint32 warp_size = 32; - - /** - * The bitmask of the entire warp. - */ - static constexpr uint32 full_lane_mask = (1ll << warp_size) - 1; - - /** - * The maximal number of threads allowed in a CUDA warp. - */ - static constexpr uint32 max_block_size = 1024; - - /** - * The minimal amount of warps that need to be scheduled for each block - * to maximize GPU occupancy. - */ - static constexpr uint32 min_warps_per_block = 4; -}; - - } // namespace cuda } // namespace kernels } // namespace gko diff --git a/cuda/base/version.cpp b/cuda/base/version.cpp index 8403ccbf50c..41785e5fc1f 100644 --- a/cuda/base/version.cpp +++ b/cuda/base/version.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/cuda/components/atomic.cuh b/cuda/components/atomic.cuh index 8031fe70b7d..7195ea85f61 100644 --- a/cuda/components/atomic.cuh +++ b/cuda/components/atomic.cuh @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,147 +34,50 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_CUDA_COMPONENTS_ATOMIC_CUH_ -namespace gko { -namespace kernels { -namespace cuda { - - -namespace detail { - - -template -struct atomic_helper { - __forceinline__ __device__ static void atomic_add(ValueType *, ValueType) - { - static_assert(sizeof(ValueType) == 0, - "This default function is not implemented, only the " - "specializations are."); - // TODO: add proper implementation of generic atomic add - } -}; - - -template -__forceinline__ __device__ ResultType reinterpret(ValueType val) -{ - static_assert(sizeof(ValueType) == sizeof(ResultType), - "The type to reinterpret to must be of the same size as the " - "original type."); - return reinterpret_cast(val); -} - - -#define GKO_BIND_ATOMIC_HELPER_STRUCTURE(CONVERTER_TYPE) \ - template \ - struct atomic_helper> { \ - __forceinline__ __device__ static void atomic_add( \ - ValueType *__restrict__ addr, ValueType val) \ - { \ - CONVERTER_TYPE *address_as_ull = \ - reinterpret_cast(addr); \ - CONVERTER_TYPE old = *address_as_ull; \ - CONVERTER_TYPE assumed; \ - do { \ - assumed = old; \ - old = atomicCAS(address_as_ull, assumed, \ - reinterpret( \ - val + reinterpret(assumed))); \ - } while (assumed != old); \ - } \ - }; - -// Support 64-bit ATOMIC_ADD -GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int); -// Support 32-bit ATOMIC_ADD -GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int); - +#include -#if !(defined(CUDA_VERSION) && (CUDA_VERSION < 10100)) -// CUDA 10.1 starts supporting 16-bit unsigned short int atomicCAS -GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int); -#endif -#undef GKO_BIND_ATOMIC_HELPER_STRUCTURE +#include "cuda/base/math.hpp" +#include "cuda/base/types.hpp" -} // namespace detail - - -template -__forceinline__ __device__ void atomic_add(T *__restrict__ addr, T val) -{ - detail::atomic_helper::atomic_add(addr, val); -} - - -#define GKO_BIND_ATOMIC_ADD(ValueType) \ - __forceinline__ __device__ void atomic_add(ValueType *__restrict__ addr, \ - ValueType val) \ - { \ - 
atomicAdd(addr, val); \ - } - -GKO_BIND_ATOMIC_ADD(int); -GKO_BIND_ATOMIC_ADD(unsigned int); -GKO_BIND_ATOMIC_ADD(unsigned long long int); -GKO_BIND_ATOMIC_ADD(float); - - -#if !((defined(CUDA_VERSION) && (CUDA_VERSION < 8000)) || \ - (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600))) -// CUDA 8.0 starts suppoting 64-bit double atomicAdd on devices of compute -// capability 6.x and higher -GKO_BIND_ATOMIC_ADD(double); -#endif - -#if !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || \ - (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) -// CUDA 10.0 starts supporting 16-bit __half floating-point atomicAdd on devices -// of compute capability 7.x and higher. -GKO_BIND_ATOMIC_ADD(__half); -#endif +namespace gko { +namespace kernels { +namespace cuda { -#if !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || \ - (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600))) -// CUDA 10.0 starts supporting 32-bit __half2 floating-point atomicAdd on -// devices of compute capability 6.x and higher. note: The atomicity of the -// __half2 add operation is guaranteed separately for each of the two __half -// elements; the entire __half2 is not guaranteed to be atomic as a single -// 32-bit access. 
-GKO_BIND_ATOMIC_ADD(__half2); -#endif -#undef GKO_BIND_ATOMIC_ADD +#include "common/components/atomic.hpp.inc" /** * @internal * - * @note It is not 'real' complex atomic add opeartion + * @note It is not 'real' complex atomic add operation */ -__forceinline__ __device__ void atomic_add( +__forceinline__ __device__ thrust::complex atomic_add( thrust::complex *__restrict__ address, thrust::complex val) { - cuComplex *cuaddr = reinterpret_cast(address); + cuComplex *addr = reinterpret_cast(address); // Separate to real part and imag part - atomic_add(&(cuaddr->x), val.real()); - atomic_add(&(cuaddr->y), val.imag()); + auto real = atomic_add(&(addr->x), val.real()); + auto imag = atomic_add(&(addr->y), val.imag()); + return {real, imag}; } + /** * @internal * - * @note It is not 'real' complex atomic add opeartion + * @note It is not 'real' complex atomic add operation */ -__forceinline__ __device__ void atomic_add( +__forceinline__ __device__ thrust::complex atomic_add( thrust::complex *__restrict__ address, thrust::complex val) { - cuDoubleComplex *cuaddr = reinterpret_cast(address); + cuDoubleComplex *addr = reinterpret_cast(address); // Separate to real part and imag part - atomic_add(&(cuaddr->x), val.real()); - atomic_add(&(cuaddr->y), val.imag()); + auto real = atomic_add(&(addr->x), val.real()); + auto imag = atomic_add(&(addr->y), val.imag()); + return {real, imag}; } diff --git a/cuda/components/cooperative_groups.cuh b/cuda/components/cooperative_groups.cuh index af9c1e68a06..e90f15fdf44 100644 --- a/cuda/components/cooperative_groups.cuh +++ b/cuda/components/cooperative_groups.cuh @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -40,7 +40,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "cuda/base/config.hpp" + + namespace gko { +namespace kernels { +namespace cuda { /** @@ -59,7 +64,7 @@ namespace gko { * A cooperative group (both from standard CUDA and from Ginkgo) is not a * specific type, but a concept. That is, any type satisfying the interface * imposed by the cooperative groups API is considered a cooperative - * group (a.k.a. "duck typing"). To maximize the generality of components than + * group (a.k.a. "duck typing"). To maximize the generality of components that * need cooperative groups, instead of creating the group manually, consider * requesting one as an input parameter. Make sure its type is a template * parameter to maximize the set of groups for which your algorithm can be @@ -228,19 +233,18 @@ public: __device__ unsigned thread_rank() const noexcept { return data_.rank; } private: + // clang-format off __device__ grid_group() - : data_{blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * - gridDim.z, - threadIdx.x + - blockDim.x * - (threadIdx.y + - blockDim.y * - (threadIdx.z + - blockDim.z * - (blockIdx.x + - gridDim.x * - (blockIdx.y + gridDim.y * blockIdx.z))))} + : data_{ + blockDim.x * blockDim.y * blockDim.z * + gridDim.x * gridDim.y * gridDim.z, + threadIdx.x + blockDim.x * + (threadIdx.y + blockDim.y * + (threadIdx.z + blockDim.z * + (blockIdx.x + gridDim.x * + (blockIdx.y + gridDim.y * blockIdx.z))))} {} + // clang-format on struct alignas(8) { unsigned size; @@ -341,7 +345,7 @@ private: template static __device__ __forceinline__ ValueType - shuffle_impl(ShuffleOperator intrinsic_shuffle, const ValueType &var, + shuffle_impl(ShuffleOperator intrinsic_shuffle, const ValueType var, SelectorType selector) { static_assert(sizeof(ValueType) % sizeof(uint32) == 0, @@ -450,15 +454,23 @@ __device__ __forceinline__ auto tiled_partition(const Group &g) } +// Only support tile_partition with 1, 2, 4, 8, 16, 32. 
+// Reference: +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-notes template -__device__ __forceinline__ thread_block_tile tiled_partition( - const Group &) +__device__ __forceinline__ gko::xstd::enable_if_t< + (Size <= kernels::cuda::config::warp_size) && (Size > 0) && + (kernels::cuda::config::warp_size % Size == 0), + thread_block_tile> +tiled_partition(const Group &) { return thread_block_tile(); } } // namespace group +} // namespace cuda +} // namespace kernels } // namespace gko diff --git a/cuda/components/diagonal_block_manipulation.cuh b/cuda/components/diagonal_block_manipulation.cuh index 482c780a9f3..ca2dacbbdef 100644 --- a/cuda/components/diagonal_block_manipulation.cuh +++ b/cuda/components/diagonal_block_manipulation.cuh @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_CUDA_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_CUH_ +#include "cuda/base/config.hpp" #include "cuda/base/types.hpp" #include "cuda/components/cooperative_groups.cuh" @@ -43,69 +44,8 @@ namespace kernels { namespace cuda { namespace csr { -/** - * @internal - * - * @note assumes that block dimensions are in "standard format": - * (subwarp_size, cuda_config::warp_size / subwarp_size, z) - */ -template < - int max_block_size, int warps_per_block, typename Group, typename ValueType, - typename IndexType, - typename = xstd::enable_if_t::value>> -__device__ __forceinline__ void extract_transposed_diag_blocks( - const Group &group, int processed_blocks, - const IndexType *__restrict__ row_ptrs, - const IndexType *__restrict__ col_idxs, - const ValueType *__restrict__ values, - const IndexType *__restrict__ block_ptrs, size_type num_blocks, - ValueType *__restrict__ block_row, int increment, - ValueType *__restrict__ workspace) -{ - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - const auto warp = group::tiled_partition(group); - auto bid = static_cast(blockIdx.x) * warps_per_block * - processed_blocks + - threadIdx.z * processed_blocks; - auto bstart = (bid < num_blocks) ? 
block_ptrs[bid] : zero(); - IndexType bsize = 0; -#pragma unroll - for (int b = 0; b < processed_blocks; ++b, ++bid) { - if (bid >= num_blocks) { - break; - } - bstart += bsize; - bsize = block_ptrs[bid + 1] - bstart; -#pragma unroll - for (int i = 0; i < max_block_size; ++i) { - if (i >= bsize) { - break; - } - if (threadIdx.y == b && threadIdx.x < max_block_size) { - workspace[threadIdx.x] = zero(); - } - warp.sync(); - const auto row = bstart + i; - const auto rstart = row_ptrs[row] + tid; - const auto rend = row_ptrs[row + 1]; - // use the entire warp to ensure coalesced memory access - for (auto j = rstart; j < rend; j += cuda_config::warp_size) { - const auto col = col_idxs[j] - bstart; - if (col >= bsize) { - break; - } - if (col >= 0) { - workspace[col] = values[j]; - } - } - warp.sync(); - if (threadIdx.y == b && threadIdx.x < bsize) { - block_row[i * increment] = workspace[threadIdx.x]; - } - warp.sync(); - } - } -} + +#include "common/components/diagonal_block_manipulation.hpp.inc" } // namespace csr diff --git a/cuda/components/zero_array.cu b/cuda/components/fill_array.cu similarity index 71% rename from cuda/components/zero_array.cu rename to cuda/components/fill_array.cu index 0596640c603..63344b7f94b 100644 --- a/cuda/components/zero_array.cu +++ b/cuda/components/fill_array.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,50 +30,41 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ -#include "cuda/components/zero_array.hpp" +#include "core/components/fill_array.hpp" + + +#include "cuda/base/types.hpp" +#include "cuda/components/thread_ids.cuh" namespace gko { namespace kernels { namespace cuda { +namespace components { constexpr int default_block_size = 512; -namespace kernel { - - -template -__global__ __launch_bounds__(default_block_size) void zero_array( - size_type n, ValueType *__restrict__ array) -{ - const auto tidx = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; - if (tidx < n) { - array[tidx] = zero(); - } -} - - -} // namespace kernel +#include "common/components/fill_array.hpp.inc" template -void zero_array(size_type n, ValueType *array) +void fill_array(std::shared_ptr exec, ValueType *array, + size_type n, ValueType val) { const dim3 block_size(default_block_size, 1, 1); const dim3 grid_size(ceildiv(n, block_size.x), 1, 1); - kernel::zero_array<<>>(n, array); + kernel::fill_array<<>>(n, as_cuda_type(array), + as_cuda_type(val)); } - -#define GKO_DECLARE_ZERO_ARRAY(_type) \ - void zero_array<_type>(size_type n, _type * array); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_ZERO_ARRAY); -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_ZERO_ARRAY); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL); +template GKO_DECLARE_FILL_ARRAY_KERNEL(size_type); +} // namespace components } // namespace cuda } // namespace kernels } // namespace gko diff --git a/cuda/components/format_conversion.cuh b/cuda/components/format_conversion.cuh index 557c8d70f8d..28206769f3e 100644 --- a/cuda/components/format_conversion.cuh +++ b/cuda/components/format_conversion.cuh @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_CUDA_COMPONENTS_FORMAT_CONVERSION_CUH_ +#include #include @@ -89,24 +90,24 @@ namespace host_kernel { /** * @internal * - * It calculates the number of warps used in Coo Spmv by GPU architecture and - * the number of stored elements. + * It calculates the number of warps used in Coo Spmv depending on the GPU + * architecture and the number of stored elements. */ -template +template __host__ size_type calculate_nwarps(std::shared_ptr exec, const size_type nnz) { - size_type warps_per_sm = exec->get_num_cores_per_sm() / subwarp_size; + size_type warps_per_sm = + exec->get_num_warps_per_sm() * config::warp_size / subwarp_size; size_type nwarps_in_cuda = exec->get_num_multiprocessor() * warps_per_sm; size_type multiple = 8; - if (nnz >= 2000000) { + if (nnz >= 2e6) { multiple = 128; - } else if (nnz >= 200000) { + } else if (nnz >= 2e5) { multiple = 32; } - return std::min( - multiple * nwarps_in_cuda, - static_cast(ceildiv(nnz, cuda_config::warp_size))); + return std::min(multiple * nwarps_in_cuda, + size_type(ceildiv(nnz, config::warp_size))); } diff --git a/cuda/components/intrinsics.cuh b/cuda/components/intrinsics.cuh new file mode 100644 index 00000000000..7726062cfa7 --- /dev/null +++ b/cuda/components/intrinsics.cuh @@ -0,0 +1,53 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_CUDA_COMPONENTS_INTRINSICS_CUH_ +#define GKO_CUDA_COMPONENTS_INTRINSICS_CUH_ + + +#include + + +namespace gko { +namespace kernels { +namespace cuda { + + +#include "common/components/intrinsics.hpp.inc" + + +} // namespace cuda +} // namespace kernels +} // namespace gko + + +#endif // GKO_CUDA_COMPONENTS_INTRINSICS_CUH_ diff --git a/cuda/components/zero_array.hpp b/cuda/components/merging.cuh similarity index 78% rename from cuda/components/zero_array.hpp rename to cuda/components/merging.cuh index a4757a49082..80b300a4daf 100644 --- a/cuda/components/zero_array.hpp +++ b/cuda/components/merging.cuh @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,13 +30,14 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ - -#ifndef GKO_CUDA_COMPONENTS_ZERO_ARRAY_HPP_ -#define GKO_CUDA_COMPONENTS_ZERO_ARRAY_HPP_ +#ifndef GKO_CUDA_COMPONENTS_MERGING_CUH_ +#define GKO_CUDA_COMPONENTS_MERGING_CUH_ +#include "core/base/utils.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" +#include "cuda/components/intrinsics.cuh" +#include "cuda/components/searching.cuh" namespace gko { @@ -44,16 +45,7 @@ namespace kernels { namespace cuda { -/** - * Zeroes an array allocated on a CUDA device. 
- * - * @tparam ValueType the type of the array's elements - * - * @param n the size of the array - * @param array the array to fill with zeros - **/ -template -void zero_array(size_type n, ValueType *array); +#include "common/components/merging.hpp.inc" } // namespace cuda @@ -61,4 +53,4 @@ void zero_array(size_type n, ValueType *array); } // namespace gko -#endif // GKO_CUDA_COMPONENTS_ZERO_ARRAY_HPP_ +#endif // GKO_CUDA_COMPONENTS_MERGING_CUH_ diff --git a/cuda/components/precision_conversion.cu b/cuda/components/precision_conversion.cu new file mode 100644 index 00000000000..f98ef2cba32 --- /dev/null +++ b/cuda/components/precision_conversion.cu @@ -0,0 +1,67 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/components/precision_conversion.hpp" + + +#include "cuda/base/types.hpp" +#include "cuda/components/thread_ids.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { +namespace components { + + +constexpr int default_block_size = 512; + + +#include "common/components/precision_conversion.hpp.inc" + + +/** + * Launches the `convert_precision` kernel over `size` entries, reading from + * `in` and writing to `out`. + * + * Returns without launching anything when `size` is zero, since a grid with + * zero blocks is an invalid CUDA launch configuration. + */ +template +void convert_precision(std::shared_ptr exec, + size_type size, const SourceType *in, TargetType *out) +{ + if (size == 0) { + return; + } + auto num_blocks = ceildiv(size, default_block_size); + convert_precision<<>>( + size, as_cuda_type(in), as_cuda_type(out)); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(GKO_DECLARE_CONVERT_PRECISION_KERNEL); + + +} // namespace components +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/components/prefix_sum.cu b/cuda/components/prefix_sum.cu new file mode 100644 index 00000000000..ba4767a2547 --- /dev/null +++ b/cuda/components/prefix_sum.cu @@ -0,0 +1,72 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/components/prefix_sum.hpp" + + +#include "cuda/components/prefix_sum.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { +namespace components { + + +constexpr int prefix_sum_block_size = 512; + + +/** + * Runs the two-step prefix sum on `counts` in place: `start_prefix_sum` + * followed by `finalize_prefix_sum`, sharing a temporary array that holds + * one entry per block of `prefix_sum_block_size` elements. + * + * @param exec executor used to allocate the temporary block sum array + * @param counts array of `num_entries` values, overwritten by the kernels + * @param num_entries number of entries in `counts`; zero is a no-op + */ +template +void prefix_sum(std::shared_ptr exec, IndexType *counts, + size_type num_entries) +{ + // A grid with zero blocks is an invalid CUDA launch configuration, and + // an empty array needs no processing. + if (num_entries == 0) { + return; + } + auto num_blocks = ceildiv(num_entries, prefix_sum_block_size); + Array block_sum_array(exec, num_blocks); + auto block_sums = block_sum_array.get_data(); + start_prefix_sum + <<>>(num_entries, counts, + block_sums); + finalize_prefix_sum + <<>>(num_entries, counts, + block_sums); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL); + +// instantiate for size_type as well, as this is used in the Sellp format +template GKO_DECLARE_PREFIX_SUM_KERNEL(size_type); + + +} // namespace components +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/components/prefix_sum.cuh b/cuda/components/prefix_sum.cuh index 8ce31f32a93..6a0cf5344c5 100644 --- a/cuda/components/prefix_sum.cuh +++ b/cuda/components/prefix_sum.cuh @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -48,99 +48,7 @@ namespace kernels { namespace cuda { -/** - * @internal - * First step of the calculation of a prefix sum. Calculates the prefix sum - * in-place on parts of the array `elements`.
- * - * @param block_size thread block size for this kernel, also size of blocks on - * which this kernel calculates the prefix sum in-place - * @param elements array on which the prefix sum is to be calculated - * @param block_sum array which stores the total sum of each block, requires at - * least `ceildiv(num_elements, block_size)` elements - * @param num_elements total number of entries in `elements` - * - * @note To calculate the prefix sum over an array of size bigger than - * `block_size`, `finalize_prefix_sum` has to be used as well. - */ -template -__global__ __launch_bounds__(block_size) void start_prefix_sum( - size_type num_elements, ValueType *__restrict__ elements, - ValueType *__restrict__ block_sum) -{ - const auto tidx = threadIdx.x + blockDim.x * blockIdx.x; - const auto element_id = threadIdx.x; - __shared__ size_type prefix_helper[block_size]; - prefix_helper[element_id] = - (tidx < num_elements) ? elements[tidx] : zero(); - auto this_block = group::this_thread_block(); - this_block.sync(); - - // Do a normal reduction -#pragma unroll - for (int i = 1; i < block_size; i <<= 1) { - const auto ai = i * (2 * element_id + 1) - 1; - const auto bi = i * (2 * element_id + 2) - 1; - if (bi < block_size) { - prefix_helper[bi] += prefix_helper[ai]; - } - this_block.sync(); - } - - if (element_id == 0) { - // Store the total sum - block_sum[blockIdx.x] = prefix_helper[block_size - 1]; - prefix_helper[block_size - 1] = zero(); - } - - this_block.sync(); - - // Perform the down-sweep phase to get the true prefix sum -#pragma unroll - for (int i = block_size >> 1; i > 0; i >>= 1) { - const auto ai = i * (2 * element_id + 1) - 1; - const auto bi = i * (2 * element_id + 2) - 1; - if (bi < block_size) { - auto tmp = prefix_helper[ai]; - prefix_helper[ai] = prefix_helper[bi]; - prefix_helper[bi] += tmp; - } - this_block.sync(); - } - if (tidx < num_elements) { - elements[tidx] = prefix_helper[element_id]; - } -} - - -/** - * @internal - * Second step of the 
calculation of a prefix sum. Increases the value of each - * entry of `elements` by the total sum of all preceding blocks. - * - * @param block_size thread block size for this kernel, has to be the same as - * for `start_prefix_sum` - * @param elements array on which the prefix sum is to be calculated - * @param block_sum array storing the total sum of each block - * @param num_elements total number of entries in `elements` - * - * @note To calculate a prefix sum, first `start_prefix_sum` has to be called. - */ -template -__global__ __launch_bounds__(block_size) void finalize_prefix_sum( - size_type num_elements, ValueType *__restrict__ elements, - const ValueType *__restrict__ block_sum) -{ - const auto tidx = threadIdx.x + blockIdx.x * blockDim.x; - - if (tidx < num_elements) { - ValueType prefix_block_sum = zero(); - for (size_type i = 0; i < blockIdx.x; i++) { - prefix_block_sum += block_sum[i]; - } - elements[tidx] += prefix_block_sum; - } -} +#include "common/components/prefix_sum.hpp.inc" } // namespace cuda diff --git a/cuda/components/reduction.cuh b/cuda/components/reduction.cuh index 839c90afed8..fd3522e6d99 100644 --- a/cuda/components/reduction.cuh +++ b/cuda/components/reduction.cuh @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "cuda/base/config.hpp" #include "cuda/base/types.hpp" #include "cuda/components/cooperative_groups.cuh" #include "cuda/components/thread_ids.cuh" @@ -53,152 +54,7 @@ namespace cuda { constexpr int default_block_size = 512; -/** - * @internal - * - * Computes a reduction using the binary operation `reduce_op` on a group - * `group`. Each thread contributes with one element `local_data`. 
The local - * thread element is always passed as the first parameter to the `reduce_op`. - * The function returns the result of the reduction on all threads. - * - * @note The function is guarantied to return the correct value on all threads - * only if `reduce_op` is commutative (in addition to being associative). - * Otherwise, the correct value is returned only to the thread with - * subwarp index 0. - */ -template < - typename Group, typename ValueType, typename Operator, - typename = xstd::enable_if_t::value>> -__device__ __forceinline__ ValueType reduce(const Group &group, - ValueType local_data, - Operator reduce_op = Operator{}) -{ -#pragma unroll - for (int32 bitmask = 1; bitmask < group.size(); bitmask <<= 1) { - const auto remote_data = group.shfl_xor(local_data, bitmask); - local_data = reduce_op(local_data, remote_data); - } - return local_data; -} - - -/** - * @internal - * - * Returns the index of the thread that has the element with the largest - * magnitude among all the threads in the group. - * Only the values from threads which set `is_pivoted` to `false` will be - * considered. - */ -template < - typename Group, typename ValueType, - typename = xstd::enable_if_t::value>> -__device__ __forceinline__ int choose_pivot(const Group &group, - ValueType local_data, - bool is_pivoted) -{ - using real = remove_complex; - real lmag = is_pivoted ? -one() : abs(local_data); - const auto pivot = - reduce(group, group.thread_rank(), [&](int lidx, int ridx) { - const auto rmag = group.shfl(lmag, ridx); - if (rmag > lmag) { - lmag = rmag; - lidx = ridx; - } - return lidx; - }); - // pivot operator not commutative, make sure everyone has the same pivot - return group.shfl(pivot, 0); -} - - -/** - * @internal - * - * Computes a reduction using the binary operation `reduce_op` on entire block. - * The data for the reduction is taken from the `data` array which has to be of - * size `block_size` and accessible from all threads. 
The `data` array is also - * used as work space (so its content will be destroyed in the process), as well - * as to store the return value - which is stored in the 0-th position of the - * array. - */ -template < - typename Group, typename ValueType, typename Operator, - typename = xstd::enable_if_t::value>> -__device__ void reduce(const Group &__restrict__ group, - ValueType *__restrict__ data, - Operator reduce_op = Operator{}) -{ - const auto local_id = group.thread_rank(); - -#pragma unroll - for (int k = group.size() / 2; k >= cuda_config::warp_size; k /= 2) { - group.sync(); - if (local_id < k) { - data[local_id] = reduce_op(data[local_id], data[local_id + k]); - } - } - - const auto warp = group::tiled_partition(group); - const auto warp_id = group.thread_rank() / warp.size(); - if (warp_id > 0) { - return; - } - auto result = reduce(warp, data[warp.thread_rank()], reduce_op); - if (warp.thread_rank() == 0) { - data[0] = result; - } -} - - -/** - * @internal - * - * Computes a reduction using the binary operation `reduce_op` on an array - * `source` of any size. Has to be called a second time on `result` to reduce - * an array larger than `block_size`. - */ -template -__device__ void reduce_array(size_type size, - const ValueType *__restrict__ source, - ValueType *__restrict__ result, - Operator reduce_op = Operator{}) -{ - const auto tidx = threadIdx.x + blockIdx.x * blockDim.x; - auto thread_result = zero(); - for (auto i = tidx; i < size; i += blockDim.x * gridDim.x) { - thread_result = reduce_op(thread_result, source[i]); - } - result[threadIdx.x] = thread_result; - - group::this_thread_block().sync(); - - // Stores the result of the reduction inside `result[0]` - reduce(group::this_thread_block(), result, reduce_op); -} - - -/** - * @internal - * - * Computes a reduction using the add operation (+) on an array - * `source` of any size. Has to be called a second time on `result` to reduce - * an array larger than `default_block_size`. 
- */ -template -__global__ __launch_bounds__(default_block_size) void reduce_add_array( - size_type size, const ValueType *__restrict__ source, - ValueType *__restrict__ result) -{ - __shared__ UninitializedArray block_sum; - reduce_array(size, source, static_cast(block_sum), - [](const ValueType &x, const ValueType &y) { return x + y; }); - - if (threadIdx.x == 0) { - result[blockIdx.x] = block_sum[0]; - } -} +#include "common/components/reduction.hpp.inc" /** @@ -233,9 +89,7 @@ __host__ ValueType reduce_add_array(std::shared_ptr exec, reduce_add_array<<<1, default_block_size>>>( grid_dim, as_cuda_type(block_results_val), as_cuda_type(d_result.get_data())); - ValueType answer = zero(); - exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(), - &answer); + auto answer = exec->copy_val_to_host(d_result.get_const_data()); return answer; } diff --git a/cuda/components/searching.cuh b/cuda/components/searching.cuh new file mode 100644 index 00000000000..186123e04f3 --- /dev/null +++ b/cuda/components/searching.cuh @@ -0,0 +1,54 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CUDA_COMPONENTS_SEARCHING_CUH_ +#define GKO_CUDA_COMPONENTS_SEARCHING_CUH_ + + +#include "cuda/base/config.hpp" +#include "cuda/components/intrinsics.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { + + +#include "common/components/searching.hpp.inc" + + +} // namespace cuda +} // namespace kernels +} // namespace gko + + +#endif // GKO_CUDA_COMPONENTS_SEARCHING_CUH_ diff --git a/cuda/components/segment_scan.cuh b/cuda/components/segment_scan.cuh index a0f87e4e555..37f5127da06 100644 --- a/cuda/components/segment_scan.cuh +++ b/cuda/components/segment_scan.cuh @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -38,7 +38,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/thread_ids.cuh" namespace gko { @@ -46,36 +45,7 @@ namespace kernels { namespace cuda { -/** - * @internal - * - * Compute a segement scan using add operation (+) of a subwarp. Each segment - * performs suffix sum. Works on the source array and returns whether the thread - * is the first element of its segment with same `ind`. - */ -template -__device__ __forceinline__ bool segment_scan( - const group::thread_block_tile &group, const IndexType ind, - ValueType *__restrict__ val) -{ - bool head = true; -#pragma unroll - for (int i = 1; i < subwarp_size; i <<= 1) { - const IndexType add_ind = group.shfl_up(ind, i); - ValueType add_val = zero(); - if (add_ind == ind && threadIdx.x >= i) { - add_val = *val; - if (i == 1) { - head = false; - } - } - add_val = group.shfl_down(add_val, i); - if (threadIdx.x < subwarp_size - i) { - *val += add_val; - } - } - return head; -} +#include "common/components/segment_scan.hpp.inc" } // namespace cuda diff --git a/cuda/components/sorting.cuh b/cuda/components/sorting.cuh new file mode 100644 index 00000000000..9a5525f7a94 --- /dev/null +++ b/cuda/components/sorting.cuh @@ -0,0 +1,54 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CUDA_COMPONENTS_SORTING_CUH_ +#define GKO_CUDA_COMPONENTS_SORTING_CUH_ + + +#include "cuda/base/config.hpp" +#include "cuda/components/cooperative_groups.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { + + +#include "common/components/sorting.hpp.inc" + + +} // namespace cuda +} // namespace kernels +} // namespace gko + + +#endif // GKO_CUDA_COMPONENTS_SORTING_CUH_ diff --git a/cuda/components/thread_ids.cuh b/cuda/components/thread_ids.cuh index fff091f4efd..31ebe0a28a6 100644 --- a/cuda/components/thread_ids.cuh +++ b/cuda/components/thread_ids.cuh @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,6 +34,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_CUDA_COMPONENTS_THREAD_IDS_CUH_ +#include "cuda/base/config.hpp" + + namespace gko { namespace kernels { namespace cuda { @@ -45,169 +48,7 @@ namespace cuda { namespace thread { -/** - * @internal - * - * Returns the ID of the block group this thread belongs to. - * - * @return the ID of the block group this thread belongs to - * - * @note Assumes that grid dimensions are in standard format: - * `(block_group_size, first_grid_dimension, second grid_dimension)` - */ -__device__ __forceinline__ size_type get_block_group_id() -{ - return static_cast(blockIdx.z) * gridDim.y + blockIdx.y; -} - -/** - * @internal - * - * Returns the ID of the block this thread belongs to. - * - * @return the ID of the block this thread belongs to - * - * @note Assumes that grid dimensions are in standard format: - * `(block_group_size, first_grid_dimension, second grid_dimension)` - */ -__device__ __forceinline__ size_type get_block_id() -{ - return get_block_group_id() * gridDim.x + blockIdx.x; -} - - -/** - * @internal - * - * Returns the local ID of the warp (relative to the block) this thread belongs - * to. - * - * @return the local ID of the warp (relative to the block) this thread belongs - * to - * - * @note Assumes that block dimensions are in standard format: - * `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size / - * cuda_config::warp_size)` - */ -__device__ __forceinline__ size_type get_local_warp_id() -{ - return static_cast(threadIdx.z); -} - - -/** - * @internal - * - * Returns the local ID of the sub-warp (relative to the block) this thread - * belongs to. 
- * - * @tparam subwarp_size size of the subwarp - * - * @return the local ID of the sub-warp (relative to the block) this thread - * belongs to - * - * @note Assumes that block dimensions are in standard format: - * `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size / - * cuda_config::warp_size)` - */ -template -__device__ __forceinline__ size_type get_local_subwarp_id() -{ - constexpr auto subwarps_per_warp = cuda_config::warp_size / subwarp_size; - return get_local_warp_id() * subwarps_per_warp + threadIdx.y; -} - - -/** - * @internal - * - * Returns the local ID of the thread (relative to the block). - * to. - * - * @tparam subwarp_size size of the subwarp - * - * @return the local ID of the thread (relative to the block) - * - * @note Assumes that block dimensions are in standard format: - * `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size / - * cuda_config::warp_size)` - */ -template -__device__ __forceinline__ size_type get_local_thread_id() -{ - return get_local_subwarp_id() * subwarp_size + threadIdx.x; -} - - -/** - * @internal - * - * Returns the global ID of the warp this thread belongs to. - * - * @tparam warps_per_block number of warps within each block - * - * @return the global ID of the warp this thread belongs to. - * - * @note Assumes that block dimensions and grid dimensions are in standard - * format: - * `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size / - * cuda_config::warp_size)` and - * `(block_group_size, first_grid_dimension, second grid_dimension)`, - * respectively. - */ -template -__device__ __forceinline__ size_type get_warp_id() -{ - return get_block_id() * warps_per_block + get_local_warp_id(); -} - - -/** - * @internal - * - * Returns the global ID of the sub-warp this thread belongs to. - * - * @tparam subwarp_size size of the subwarp - * - * @return the global ID of the sub-warp this thread belongs to. 
- * - * @note Assumes that block dimensions and grid dimensions are in standard - * format: - * `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size / - * cuda_config::warp_size)` and - * `(block_group_size, first_grid_dimension, second grid_dimension)`, - * respectively. - */ -template -__device__ __forceinline__ size_type get_subwarp_id() -{ - constexpr auto subwarps_per_warp = cuda_config::warp_size / subwarp_size; - return get_warp_id() * subwarps_per_warp + threadIdx.y; -} - - -/** - * @internal - * - * Returns the global ID of the thread. - * - * @return the global ID of the thread. - * - * @tparam subwarp_size size of the subwarp - * - * @note Assumes that block dimensions and grid dimensions are in standard - * format: - * `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size / - * cuda_config::warp_size)` and - * `(block_group_size, first_grid_dimension, second grid_dimension)`, - * respectively. - */ -template -__device__ __forceinline__ size_type get_thread_id() -{ - return get_subwarp_id() * subwarp_size + - threadIdx.x; -} +#include "common/components/thread_ids.hpp.inc" } // namespace thread diff --git a/cuda/components/uninitialized_array.hpp b/cuda/components/uninitialized_array.hpp index e1d47d9e717..b3d9096f0c9 100644 --- a/cuda/components/uninitialized_array.hpp +++ b/cuda/components/uninitialized_array.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -42,54 +42,7 @@ namespace kernels { namespace cuda { -template -/** - * Stores an array with uninitialized contents. - */ -class UninitializedArray { -public: - /** - * Operator for casting an UninitializedArray into its constexpr value - * pointer. - * @return the constexpr pointer to the first entry of the array. 
- */ - constexpr GKO_ATTRIBUTES operator ValueType *() const noexcept - { - return &(*this)[0]; - } - - /** - * Operator for casting an UninitializedArray into its non-const value - * pointer. - * @return the non-const pointer to the first entry of the array. - */ - GKO_ATTRIBUTES operator ValueType *() noexcept { return &(*this)[0]; } - - /** - * constexpr array access operator. - * @param pos The array index. Using a value outside [0, size) is undefined - * behavior. - * @return a reference to the array entry at the given index. - */ - constexpr GKO_ATTRIBUTES ValueType &operator[](size_type pos) const noexcept - { - return reinterpret_cast(data_)[pos]; - } - - /** - * Non-const array access operator. - * @param pos The array index. Using a value outside [0, size) is undefined - * behavior. - * @return a reference to the array entry at the given index. - */ - GKO_ATTRIBUTES ValueType &operator[](size_type pos) noexcept - { - return reinterpret_cast(data_)[pos]; - } - -private: - unsigned char data_[sizeof(ValueType) / sizeof(unsigned char) * size]; -}; +#include "common/components/uninitialized_array.hpp.inc" } // namespace cuda @@ -97,4 +50,4 @@ class UninitializedArray { } // namespace gko -#endif // GKO_CUDA_BASE_COMPONENTS_ARRAY_HPP_ +#endif // GKO_CUDA_COMPONENTS_UNINITIALIZED_ARRAY_HPP_ diff --git a/cuda/components/warp_blas.cuh b/cuda/components/warp_blas.cuh index ba5906142a2..4ae18bfde18 100644 --- a/cuda/components/warp_blas.cuh +++ b/cuda/components/warp_blas.cuh @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,6 +34,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_CUDA_COMPONENTS_WARP_BLAS_CUH_ +#include + + #include @@ -41,374 +44,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/components/reduction.cuh" -#include - - namespace gko { namespace kernels { namespace cuda { -/** - * @internal - * - * Defines a postprocessing transformation that should be performed on the - * result of a function call. - * - * @note This functionality should become useless once accessors and ranges are - * in place, as they will define the storage scheme. - */ -enum postprocess_transformation { and_return, and_transpose }; - - -/** - * @internal - * - * Applies a Gauss-Jordan transformation (single step of Gauss-Jordan - * elimination) to a `max_problem_size`-by-`max_problem_size` matrix using - * using the thread group `group. Each thread contributes one `row` of the - * matrix, and the routine uses warp shuffles to exchange data between rows. The - * transform is performed by using the `key_row`-th row and `key_col`-th column - * of the matrix. 
- */ -template < - int max_problem_size, typename Group, typename ValueType, - typename = xstd::enable_if_t::value>> -__device__ __forceinline__ void apply_gauss_jordan_transform( - const Group &__restrict__ group, int32 key_row, int32 key_col, - ValueType *__restrict__ row, bool &__restrict__ status) -{ - auto key_col_elem = group.shfl(row[key_col], key_row); - if (key_col_elem == zero()) { - // TODO: implement error handling for GPUs to be able to properly - // report it here - status = false; - return; - } - if (group.thread_rank() == key_row) { - key_col_elem = one() / key_col_elem; - } else { - key_col_elem = -row[key_col] / key_col_elem; - } -#pragma unroll - for (int32 i = 0; i < max_problem_size; ++i) { - const auto key_row_elem = group.shfl(row[i], key_row); - if (group.thread_rank() == key_row) { - row[i] = zero(); - } - row[i] += key_col_elem * key_row_elem; - } - row[key_col] = key_col_elem; -} - - -/** - * @internal - * - * Inverts a matrix using Gauss-Jordan elimination. The inversion is - * done in-place, so the original matrix will be overridden with the inverse. - * The inversion routine uses implicit pivoting, so the returned matrix will be - * a permuted inverse (from both sides). To obtain the correct inverse, the - * rows of the result should be permuted with $P$, and the columns with - * $ P^T $ (i.e. - * $ A^{-1} = P X P $, where $ X $ is the returned matrix). These - * permutation matrices are returned compressed as vectors `perm` and - * `trans_perm`, respectively. `i`-th value of each of the vectors is returned - * to thread of the group with rank `i`. 
- * - * @tparam max_problem_size the maximum problem size that will be passed to the - * inversion routine (a tighter bound results in - * faster code - * @tparam Group type of the group of threads - * @tparam ValueType type of values stored in the matrix - * - * @param group the group of threads which participate in the inversion - * @param problem_size the actual size of the matrix (cannot be larger than - * max_problem_size) - * @param row a pointer to the matrix row (i-th thread in the group should - * pass the pointer to the i-th row), has to have at least - * max_problem_size elements - * @param perm a value to hold an element of permutation matrix $ P $ - * @param trans_perm a value to hold an element of permutation matrix $ P^T - * $ - * - * @return true if the inversion succeeded, false otherwise - */ -template < - int max_problem_size, typename Group, typename ValueType, - typename = xstd::enable_if_t::value>> -__device__ __forceinline__ bool invert_block(const Group &__restrict__ group, - uint32 problem_size, - ValueType *__restrict__ row, - uint32 &__restrict__ perm, - uint32 &__restrict__ trans_perm) -{ - GKO_ASSERT(problem_size <= max_problem_size); - // prevent rows after problem_size to become pivots - auto pivoted = group.thread_rank() >= problem_size; - auto status = true; -#ifdef GINKGO_JACOBI_FULL_OPTIMIZATIONS -#pragma unroll -#else -#pragma unroll 1 -#endif - for (int32 i = 0; i < max_problem_size; ++i) { - if (i >= problem_size) { - break; - } - const auto piv = choose_pivot(group, row[i], pivoted); - if (group.thread_rank() == piv) { - perm = i; - pivoted = true; - } - if (group.thread_rank() == i) { - trans_perm = piv; - } - apply_gauss_jordan_transform(group, piv, i, row, - status); - } - return status; -} - - -/** - * @internal - * - * Performs the correct index calculation for the given postprocess operation. 
- */ -template -__host__ __device__ __forceinline__ auto get_row_major_index(T1 row, T2 col, - T3 stride) -> - typename std::enable_if< - mod != and_transpose, - typename std::decay::type>::type -{ - return row * stride + col; -} - - -template -__host__ __device__ __forceinline__ auto get_row_major_index(T1 row, T2 col, - T3 stride) -> - typename std::enable_if< - mod == and_transpose, - typename std::decay::type>::type -{ - return col * stride + row; -} - - -/** - * @internal - * - * Copies a matrix stored as a collection of rows in different threads of the - * warp in a block of memory accessible by all threads in row-major order. - * Optionally permutes rows and columns of the matrix in the process. - * - * @tparam max_problem_size maximum problem size passed to the routine - * @tparam mod the transformation to perform on the return data - * @tparam Group type of the group of threads - * @tparam SourceValueType type of values stored in the source matrix - * @tparam ResultValueType type of values stored in the result matrix - * - * @param group group of threads participating in the copy - * @param problem_size actual size of the matrix - * (`problem_size <= max_problem_size`) - * @param source_row pointer to memory used to store a row of the source matrix - * `i`-th thread of the sub-warp should pass in the `i`-th - * row of the matrix - * @param increment offset between two consecutive elements of the row - * @param row_perm permutation vector to apply on the rows of the matrix - * (thread `i` supplies the `i`-th value of the vector) - * @param col_perm permutation vector to apply on the column of the matrix - * (thread `i` supplies the `i`-th value of the vector) - * @param destination pointer to memory where the result will be stored - * (all threads supply the same value) - * @param stride offset between two consecutive rows of the matrix - */ -template < - int max_problem_size, postprocess_transformation mod = and_return, - typename Group, typename 
SourceValueType, typename ResultValueType, - typename = xstd::enable_if_t::value>> -__device__ __forceinline__ void copy_matrix( - const Group &__restrict__ group, uint32 problem_size, - const SourceValueType *__restrict__ source_row, uint32 increment, - uint32 row_perm, uint32 col_perm, ResultValueType *__restrict__ destination, - size_type stride) -{ - GKO_ASSERT(problem_size <= max_problem_size); -#pragma unroll - for (int32 i = 0; i < max_problem_size; ++i) { - if (i >= problem_size) { - break; - } - const auto idx = group.shfl(col_perm, i); - if (group.thread_rank() < problem_size) { - destination[get_row_major_index(idx, row_perm, stride)] = - static_cast(source_row[i * increment]); - } - } -} - - -/** - * @internal - * - * Multiplies a transposed vector and a matrix stored in column-major order. - * - * In mathematical terms, performs the operation $ res^T = vec^T \cdot mtx$. - * - * @tparam max_problem_size maximum problem size passed to the routine - * @tparam Group type of the group of threads - * @tparam MatrixValueType type of values stored in the matrix - * @tparam VectorValueType type of values stored in the vectors - * - * @param group group of threads participating in the operation - * @param problem_size actual size of the matrix - * (`problem_size <= max_problem_size`) - * @param vec input vector to multiply (thread `i` supplies the `i`-th value of - * the vector) - * @param mtx_row pointer to memory used to store a row of the input matrix, - * `i`-th thread of the sub-warp should pass in the - * `i`-th row of the matrix - * @param mtx_increment offset between two consecutive elements of the row - * @param res pointer to a block of memory where the result will be written - * (only thread 0 of the group has to supply a valid value) - * @param mtx_increment offset between two consecutive elements of the result - */ -template < - int max_problem_size, typename Group, typename MatrixValueType, - typename VectorValueType, - typename = 
xstd::enable_if_t::value>> -__device__ __forceinline__ void multiply_transposed_vec( - const Group &__restrict__ group, uint32 problem_size, - const VectorValueType &__restrict__ vec, - const MatrixValueType *__restrict__ mtx_row, uint32 mtx_increment, - VectorValueType *__restrict__ res, uint32 res_increment) -{ - GKO_ASSERT(problem_size <= max_problem_size); - auto mtx_elem = zero(); -#pragma unroll - for (int32 i = 0; i < max_problem_size; ++i) { - if (i >= problem_size) { - break; - } - if (group.thread_rank() < problem_size) { - mtx_elem = static_cast(mtx_row[i * mtx_increment]); - } - const auto out = - reduce(group, mtx_elem * vec, - [](VectorValueType x, VectorValueType y) { return x + y; }); - if (group.thread_rank() == 0) { - res[i * res_increment] = out; - } - } -} - - -/** - * @internal - * - * Multiplies a matrix and a vector stored in column-major order. - * - * In mathematical terms, performs the operation $res = mtx \cdot vec$. - * - * @tparam max_problem_size maximum problem size passed to the routine - * @tparam Group type of the group of threads - * @tparam MatrixValueType type of values stored in the matrix - * @tparam VectorValueType type of values stored in the vectors - * @tparam Closure type of the function used to write the result - * - * @param group group of threads participating in the operation - * @param problem_size actual size of the matrix - * (`problem_size <= max_problem_size`) - * @param vec input vector to multiply (thread `i` supplies the `i`-th value of - * the vector) - * @param mtx_row pointer to memory used to store a row of the input matrix, - * `i`-th thread of the sub-warp should pass in the - * `i`-th row of the matrix - * @param mtx_increment offset between two consecutive elements of the row - * @param res pointer to a block of memory where the result will be written - * (only thread 0 of the group has to supply a valid value) - * @param mtx_increment offset between two consecutive elements of the result - * @param 
closure_op Operation that is performed when writing to - `res[group.thread_rank() * res_increment]` as - `closure_op(res[group.thread_rank() * res_increment], out)` - where `out` is the result of $mtx \cdot vec$. - */ -template < - int max_problem_size, typename Group, typename MatrixValueType, - typename VectorValueType, typename Closure, - typename = xstd::enable_if_t::value>> -__device__ __forceinline__ void multiply_vec( - const Group &__restrict__ group, uint32 problem_size, - const VectorValueType &__restrict__ vec, - const MatrixValueType *__restrict__ mtx_row, uint32 mtx_increment, - VectorValueType *__restrict__ res, uint32 res_increment, Closure closure_op) -{ - GKO_ASSERT(problem_size <= max_problem_size); - auto mtx_elem = zero(); - auto out = zero(); -#pragma unroll - for (int32 i = 0; i < max_problem_size; ++i) { - if (i >= problem_size) { - break; - } - if (group.thread_rank() < problem_size) { - mtx_elem = static_cast(mtx_row[i * mtx_increment]); - } - out += mtx_elem * group.shfl(vec, i); - } - if (group.thread_rank() < problem_size) { - closure_op(res[group.thread_rank() * res_increment], out); - } -} - - -/** - * @internal - * - * Computes the infinity norm of a matrix. Each thread in the group supplies - * one row of the matrix. 
- * - * @tparam max_problem_size maximum problem size passed to the routine - * @tparam Group type of the group of threads - * @tparam ValueType type of values stored in the matrix - * - * @param group group of threads participating in the operation - * @param num_rows number of rows of the matrix - * (`num_rows <= max_problem_size`) - * @param num_cols number of columns of the matrix - * @param row pointer to memory used to store a row of the input matrix, - * `i`-th thread of the group should pass in the `i`-th row of the - * matrix - * - * @return the infinity norm of the matrix - */ -template < - int max_problem_size, typename Group, typename ValueType, - typename = xstd::enable_if_t::value>> -__device__ __forceinline__ remove_complex compute_infinity_norm( - const Group &group, uint32 num_rows, uint32 num_cols, const ValueType *row) -{ - using result_type = remove_complex; - auto sum = zero(); - if (group.thread_rank() < num_rows) { -#ifdef GINKGO_JACOBI_FULL_OPTIMIZATIONS -#pragma unroll -#else -#pragma unroll 1 -#endif - for (uint32 i = 0; i < max_problem_size; ++i) { - if (i >= num_cols) { - break; - } - sum += abs(row[i]); - } - } - return reduce(group, sum, - [](result_type x, result_type y) { return max(x, y); }); -} +#include "common/components/warp_blas.hpp.inc" } // namespace cuda diff --git a/cuda/factorization/factorization_kernels.cu b/cuda/factorization/factorization_kernels.cu new file mode 100644 index 00000000000..6f5f6b4ee05 --- /dev/null +++ b/cuda/factorization/factorization_kernels.cu @@ -0,0 +1,252 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/factorization/factorization_kernels.hpp" + + +#include +#include + + +#include "core/components/prefix_sum.hpp" +#include "core/matrix/csr_builder.hpp" +#include "cuda/base/config.hpp" +#include "cuda/base/types.hpp" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/intrinsics.cuh" +#include "cuda/components/searching.cuh" +#include "cuda/components/thread_ids.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The factorization namespace. 
+ * + * @ingroup factor + */ +namespace factorization { + + +constexpr int default_block_size{512}; + + +#include "common/factorization/factorization_kernels.hpp.inc" + + +template +void add_diagonal_elements(std::shared_ptr exec, + matrix::Csr *mtx, + bool is_sorted) +{ + // TODO: Runtime can be optimized by choosing a appropriate size for the + // subwarp dependent on the matrix properties + constexpr int subwarp_size = config::warp_size; + auto mtx_size = mtx->get_size(); + auto num_rows = static_cast(mtx_size[0]); + auto num_cols = static_cast(mtx_size[1]); + size_type row_ptrs_size = num_rows + 1; + + Array row_ptrs_addition(exec, row_ptrs_size); + Array needs_change_host{exec->get_master(), 1}; + needs_change_host.get_data()[0] = false; + Array needs_change_device{exec, 1}; + needs_change_device = needs_change_host; + + auto cuda_old_values = as_cuda_type(mtx->get_const_values()); + auto cuda_old_col_idxs = as_cuda_type(mtx->get_const_col_idxs()); + auto cuda_old_row_ptrs = as_cuda_type(mtx->get_row_ptrs()); + auto cuda_row_ptrs_add = as_cuda_type(row_ptrs_addition.get_data()); + + const dim3 block_dim{default_block_size, 1, 1}; + const dim3 grid_dim{ + static_cast(ceildiv(num_rows, block_dim.x / subwarp_size)), 1, + 1}; + if (is_sorted) { + kernel::find_missing_diagonal_elements + <<>>( + num_rows, num_cols, cuda_old_col_idxs, cuda_old_row_ptrs, + cuda_row_ptrs_add, + as_cuda_type(needs_change_device.get_data())); + } else { + kernel::find_missing_diagonal_elements + <<>>( + num_rows, num_cols, cuda_old_col_idxs, cuda_old_row_ptrs, + cuda_row_ptrs_add, + as_cuda_type(needs_change_device.get_data())); + } + needs_change_host = needs_change_device; + if (!needs_change_host.get_const_data()[0]) { + return; + } + + components::prefix_sum(exec, cuda_row_ptrs_add, row_ptrs_size); + exec->synchronize(); + + auto total_additions = + exec->copy_val_to_host(cuda_row_ptrs_add + row_ptrs_size - 1); + size_type new_num_elems = static_cast(total_additions) + + 
mtx->get_num_stored_elements(); + + + Array new_values{exec, new_num_elems}; + Array new_col_idxs{exec, new_num_elems}; + auto cuda_new_values = as_cuda_type(new_values.get_data()); + auto cuda_new_col_idxs = as_cuda_type(new_col_idxs.get_data()); + + kernel::add_missing_diagonal_elements + <<>>(num_rows, cuda_old_values, cuda_old_col_idxs, + cuda_old_row_ptrs, cuda_new_values, + cuda_new_col_idxs, cuda_row_ptrs_add); + + const dim3 grid_dim_row_ptrs_update{ + static_cast(ceildiv(num_rows, block_dim.x)), 1, 1}; + kernel::update_row_ptrs<<>>( + num_rows + 1, cuda_old_row_ptrs, cuda_row_ptrs_add); + + matrix::CsrBuilder mtx_builder{mtx}; + mtx_builder.get_value_array() = std::move(new_values); + mtx_builder.get_col_idx_array() = std::move(new_col_idxs); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL); + + +template +void initialize_row_ptrs_l_u( + std::shared_ptr exec, + const matrix::Csr *system_matrix, + IndexType *l_row_ptrs, IndexType *u_row_ptrs) +{ + const size_type num_rows{system_matrix->get_size()[0]}; + + const dim3 block_size{default_block_size, 1, 1}; + const uint32 number_blocks = + ceildiv(num_rows, static_cast(block_size.x)); + const dim3 grid_dim{number_blocks, 1, 1}; + + kernel::count_nnz_per_l_u_row<<>>( + num_rows, as_cuda_type(system_matrix->get_const_row_ptrs()), + as_cuda_type(system_matrix->get_const_col_idxs()), + as_cuda_type(system_matrix->get_const_values()), + as_cuda_type(l_row_ptrs), as_cuda_type(u_row_ptrs)); + + components::prefix_sum(exec, l_row_ptrs, num_rows + 1); + components::prefix_sum(exec, u_row_ptrs, num_rows + 1); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL); + + +template +void initialize_l_u(std::shared_ptr exec, + const matrix::Csr *system_matrix, + matrix::Csr *csr_l, + matrix::Csr *csr_u) +{ + const size_type num_rows{system_matrix->get_size()[0]}; + const dim3 block_size{default_block_size, 
1, 1}; + const dim3 grid_dim{static_cast(ceildiv( + num_rows, static_cast(block_size.x))), + 1, 1}; + + kernel::initialize_l_u<<>>( + num_rows, as_cuda_type(system_matrix->get_const_row_ptrs()), + as_cuda_type(system_matrix->get_const_col_idxs()), + as_cuda_type(system_matrix->get_const_values()), + as_cuda_type(csr_l->get_const_row_ptrs()), + as_cuda_type(csr_l->get_col_idxs()), as_cuda_type(csr_l->get_values()), + as_cuda_type(csr_u->get_const_row_ptrs()), + as_cuda_type(csr_u->get_col_idxs()), as_cuda_type(csr_u->get_values())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL); + + +template +void initialize_row_ptrs_l( + std::shared_ptr exec, + const matrix::Csr *system_matrix, + IndexType *l_row_ptrs) +{ + const size_type num_rows{system_matrix->get_size()[0]}; + + const dim3 block_size{default_block_size, 1, 1}; + const uint32 number_blocks = + ceildiv(num_rows, static_cast(block_size.x)); + const dim3 grid_dim{number_blocks, 1, 1}; + + kernel::count_nnz_per_l_row<<>>( + num_rows, as_cuda_type(system_matrix->get_const_row_ptrs()), + as_cuda_type(system_matrix->get_const_col_idxs()), + as_cuda_type(system_matrix->get_const_values()), + as_cuda_type(l_row_ptrs)); + + components::prefix_sum(exec, l_row_ptrs, num_rows + 1); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL); + + +template +void initialize_l(std::shared_ptr exec, + const matrix::Csr *system_matrix, + matrix::Csr *csr_l, bool diag_sqrt) +{ + const size_type num_rows{system_matrix->get_size()[0]}; + const dim3 block_size{default_block_size, 1, 1}; + const dim3 grid_dim{static_cast(ceildiv( + num_rows, static_cast(block_size.x))), + 1, 1}; + + kernel::initialize_l<<>>( + num_rows, as_cuda_type(system_matrix->get_const_row_ptrs()), + as_cuda_type(system_matrix->get_const_col_idxs()), + as_cuda_type(system_matrix->get_const_values()), + as_cuda_type(csr_l->get_const_row_ptrs()), + 
as_cuda_type(csr_l->get_col_idxs()), as_cuda_type(csr_l->get_values()), + diag_sqrt); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL); + + +} // namespace factorization +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/factorization/ilu_kernels.cu b/cuda/factorization/ilu_kernels.cu new file mode 100644 index 00000000000..b7debb21bc3 --- /dev/null +++ b/cuda/factorization/ilu_kernels.cu @@ -0,0 +1,95 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/factorization/ilu_kernels.hpp" + + +#include + + +#include "cuda/base/cusparse_bindings.hpp" +#include "cuda/base/device_guard.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The ilu factorization namespace. + * + * @ingroup factor + */ +namespace ilu_factorization { + + +template +void compute_lu(std::shared_ptr exec, + matrix::Csr *m) +{ + const auto id = exec->get_device_id(); + auto handle = exec->get_cusparse_handle(); + gko::cuda::device_guard g{id}; + auto desc = cusparse::create_mat_descr(); + auto info = cusparse::create_ilu0_info(); + + // get buffer size for ILU + IndexType num_rows = m->get_size()[0]; + IndexType nnz = m->get_num_stored_elements(); + size_type buffer_size{}; + cusparse::ilu0_buffer_size(handle, num_rows, nnz, desc, + m->get_const_values(), m->get_const_row_ptrs(), + m->get_const_col_idxs(), info, buffer_size); + + Array buffer{exec, buffer_size}; + + // set up ILU(0) + cusparse::ilu0_analysis(handle, num_rows, nnz, desc, m->get_const_values(), + m->get_const_row_ptrs(), m->get_const_col_idxs(), + info, CUSPARSE_SOLVE_POLICY_USE_LEVEL, + buffer.get_data()); + + cusparse::ilu0(handle, num_rows, nnz, desc, m->get_values(), + m->get_const_row_ptrs(), m->get_const_col_idxs(), info, + CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); + + cusparse::destroy(info); + cusparse::destroy(desc); +} + 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ILU_COMPUTE_LU_KERNEL); + + +} // namespace ilu_factorization +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/factorization/par_ict_kernels.cu b/cuda/factorization/par_ict_kernels.cu new file mode 100644 index 00000000000..98aa1c04831 --- /dev/null +++ b/cuda/factorization/par_ict_kernels.cu @@ -0,0 +1,209 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/factorization/par_ict_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include "core/components/prefix_sum.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "cuda/base/math.hpp" +#include "cuda/components/intrinsics.cuh" +#include "cuda/components/merging.cuh" +#include "cuda/components/prefix_sum.cuh" +#include "cuda/components/reduction.cuh" +#include "cuda/components/searching.cuh" +#include "cuda/components/thread_ids.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The parallel ICT factorization namespace. + * + * @ingroup factor + */ +namespace par_ict_factorization { + + +constexpr auto default_block_size = 512; + + +// subwarp sizes for all warp-parallel kernels (filter, add_candidates) +using compiled_kernels = + syn::value_list; + + +#include "common/factorization/par_ict_spgeam_kernels.hpp.inc" +#include "common/factorization/par_ict_sweep_kernels.hpp.inc" + + +namespace { + + +template +void add_candidates(syn::value_list, + std::shared_ptr exec, + const matrix::Csr *llt, + const matrix::Csr *a, + const matrix::Csr *l, + matrix::Csr *l_new) +{ + auto num_rows = static_cast(llt->get_size()[0]); + auto subwarps_per_block = default_block_size / subwarp_size; + auto num_blocks = ceildiv(num_rows, subwarps_per_block); + matrix::CsrBuilder l_new_builder(l_new); + auto llt_row_ptrs = llt->get_const_row_ptrs(); + auto llt_col_idxs = llt->get_const_col_idxs(); + auto llt_vals = llt->get_const_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto a_vals = a->get_const_values(); + auto l_row_ptrs = l->get_const_row_ptrs(); + auto l_col_idxs = l->get_const_col_idxs(); + auto l_vals = l->get_const_values(); + auto l_new_row_ptrs = 
l_new->get_row_ptrs(); + // count non-zeros per row + kernel::ict_tri_spgeam_nnz + <<>>(llt_row_ptrs, llt_col_idxs, + a_row_ptrs, a_col_idxs, + l_new_row_ptrs, num_rows); + + // build row ptrs + components::prefix_sum(exec, l_new_row_ptrs, num_rows + 1); + + // resize output arrays + auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows); + l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz); + l_new_builder.get_value_array().resize_and_reset(l_new_nnz); + + auto l_new_col_idxs = l_new->get_col_idxs(); + auto l_new_vals = l_new->get_values(); + + // fill columns and values + kernel::ict_tri_spgeam_init + <<>>( + llt_row_ptrs, llt_col_idxs, as_cuda_type(llt_vals), a_row_ptrs, + a_col_idxs, as_cuda_type(a_vals), l_row_ptrs, l_col_idxs, + as_cuda_type(l_vals), l_new_row_ptrs, l_new_col_idxs, + as_cuda_type(l_new_vals), num_rows); +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); + + +template +void compute_factor(syn::value_list, + std::shared_ptr exec, + const matrix::Csr *a, + matrix::Csr *l, + const matrix::Coo *l_coo) +{ + auto total_nnz = static_cast(l->get_num_stored_elements()); + auto block_size = default_block_size / subwarp_size; + auto num_blocks = ceildiv(total_nnz, block_size); + kernel::ict_sweep<<>>( + a->get_const_row_ptrs(), a->get_const_col_idxs(), + as_cuda_type(a->get_const_values()), l->get_const_row_ptrs(), + l_coo->get_const_row_idxs(), l->get_const_col_idxs(), + as_cuda_type(l->get_values()), + static_cast(l->get_num_stored_elements())); +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor); + + +} // namespace + + +template +void add_candidates(std::shared_ptr exec, + const matrix::Csr *llt, + const matrix::Csr *a, + const matrix::Csr *l, + matrix::Csr *l_new) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = + llt->get_num_stored_elements() + a->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_add_candidates( + 
compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, llt, a, l, l_new); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL); + + +template +void compute_factor(std::shared_ptr exec, + const matrix::Csr *a, + matrix::Csr *l, + const matrix::Coo *l_coo) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = 2 * l->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_compute_factor( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, a, l, l_coo); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL); + + +} // namespace par_ict_factorization +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/factorization/par_ilu_kernels.cu b/cuda/factorization/par_ilu_kernels.cu index 6f212b9d75c..65e3798a881 100644 --- a/cuda/factorization/par_ilu_kernels.cu +++ b/cuda/factorization/par_ilu_kernels.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,14 +33,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/factorization/par_ilu_kernels.hpp" -#include #include #include #include "cuda/base/math.hpp" #include "cuda/base/types.hpp" -#include "cuda/components/prefix_sum.cuh" +#include "cuda/components/thread_ids.cuh" namespace gko { @@ -57,186 +56,7 @@ namespace par_ilu_factorization { constexpr int default_block_size{512}; -namespace kernel { - - -template -__global__ __launch_bounds__(default_block_size) void count_nnz_per_l_u_row( - size_type num_rows, const IndexType *__restrict__ row_ptrs, - const IndexType *__restrict__ col_idxs, - const ValueType *__restrict__ values, IndexType *__restrict__ l_nnz_row, - IndexType *__restrict__ u_nnz_row) -{ - const auto row = blockDim.x * blockIdx.x + threadIdx.x; - if (row < num_rows) { - IndexType l_row_nnz{}; - IndexType u_row_nnz{}; - for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; ++idx) { - auto col = col_idxs[idx]; - l_row_nnz += (col <= row); - u_row_nnz += (row <= col); - } - l_nnz_row[row] = l_row_nnz; - u_nnz_row[row] = u_row_nnz; - } -} - - -} // namespace kernel - - -template -void initialize_row_ptrs_l_u( - std::shared_ptr exec, - const matrix::Csr *system_matrix, - IndexType *l_row_ptrs, IndexType *u_row_ptrs) -{ - const size_type num_rows{system_matrix->get_size()[0]}; - const size_type num_row_ptrs{num_rows + 1}; - - const dim3 block_size{default_block_size, 1, 1}; - const uint32 number_blocks = - ceildiv(num_rows, static_cast(block_size.x)); - const dim3 grid_dim{number_blocks, 1, 1}; - - kernel::count_nnz_per_l_u_row<<>>( - num_rows, as_cuda_type(system_matrix->get_const_row_ptrs()), - as_cuda_type(system_matrix->get_const_col_idxs()), - as_cuda_type(system_matrix->get_const_values()), - as_cuda_type(l_row_ptrs), as_cuda_type(u_row_ptrs)); - - Array block_sum(exec, grid_dim.x); - auto block_sum_ptr = block_sum.get_data(); - - start_prefix_sum<<>>( - num_row_ptrs, as_cuda_type(l_row_ptrs), as_cuda_type(block_sum_ptr)); - finalize_prefix_sum<<>>( - num_row_ptrs, as_cuda_type(l_row_ptrs), 
as_cuda_type(block_sum_ptr)); - - start_prefix_sum<<>>( - num_row_ptrs, as_cuda_type(u_row_ptrs), as_cuda_type(block_sum_ptr)); - finalize_prefix_sum<<>>( - num_row_ptrs, as_cuda_type(u_row_ptrs), as_cuda_type(block_sum_ptr)); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILU_INITIALIZE_ROW_PTRS_L_U_KERNEL); - - -namespace kernel { - - -template -__global__ __launch_bounds__(default_block_size) void initialize_l_u( - size_type num_rows, const IndexType *__restrict__ row_ptrs, - const IndexType *__restrict__ col_idxs, - const ValueType *__restrict__ values, - const IndexType *__restrict__ l_row_ptrs, - IndexType *__restrict__ l_col_idxs, ValueType *__restrict__ l_values, - const IndexType *__restrict__ u_row_ptrs, - IndexType *__restrict__ u_col_idxs, ValueType *__restrict__ u_values) -{ - const auto row = blockDim.x * blockIdx.x + threadIdx.x; - if (row < num_rows) { - auto l_idx = l_row_ptrs[row]; - auto u_idx = u_row_ptrs[row]; - for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) { - const auto col = col_idxs[i]; - const auto val = values[i]; - if (col <= row) { - l_col_idxs[l_idx] = col; - l_values[l_idx] = (col == row ? 
one() : val); - ++l_idx; - } - if (row <= col) { - u_col_idxs[u_idx] = col; - u_values[u_idx] = val; - ++u_idx; - } - } - } -} - - -} // namespace kernel - - -template -void initialize_l_u(std::shared_ptr exec, - const matrix::Csr *system_matrix, - matrix::Csr *csr_l, - matrix::Csr *csr_u) -{ - const size_type num_rows{system_matrix->get_size()[0]}; - const dim3 block_size{default_block_size, 1, 1}; - const dim3 grid_dim{static_cast(ceildiv( - num_rows, static_cast(block_size.x))), - 1, 1}; - - kernel::initialize_l_u<<>>( - num_rows, as_cuda_type(system_matrix->get_const_row_ptrs()), - as_cuda_type(system_matrix->get_const_col_idxs()), - as_cuda_type(system_matrix->get_const_values()), - as_cuda_type(csr_l->get_const_row_ptrs()), - as_cuda_type(csr_l->get_col_idxs()), as_cuda_type(csr_l->get_values()), - as_cuda_type(csr_u->get_const_row_ptrs()), - as_cuda_type(csr_u->get_col_idxs()), as_cuda_type(csr_u->get_values())); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILU_INITIALIZE_L_U_KERNEL); - - -namespace kernel { - - -template -__global__ __launch_bounds__(default_block_size) void compute_l_u_factors( - size_type num_elements, const IndexType *__restrict__ row_idxs, - const IndexType *__restrict__ col_idxs, - const ValueType *__restrict__ values, - const IndexType *__restrict__ l_row_ptrs, - const IndexType *__restrict__ l_col_idxs, ValueType *__restrict__ l_values, - const IndexType *__restrict__ u_row_ptrs, - const IndexType *__restrict__ u_col_idxs, ValueType *__restrict__ u_values) -{ - const auto elem_id = blockDim.x * blockIdx.x + threadIdx.x; - if (elem_id < num_elements) { - const auto row = row_idxs[elem_id]; - const auto col = col_idxs[elem_id]; - const auto val = values[elem_id]; - auto l_idx = l_row_ptrs[row]; - auto u_idx = u_row_ptrs[col]; - ValueType sum{val}; - ValueType last_operation{}; - while (l_idx < l_row_ptrs[row + 1] && u_idx < u_row_ptrs[col + 1]) { - const auto l_col = l_col_idxs[l_idx]; - const auto u_col = 
u_col_idxs[u_idx]; - last_operation = zero(); - if (l_col == u_col) { - last_operation = l_values[l_idx] * u_values[u_idx]; - sum -= last_operation; - } - l_idx += (l_col <= u_col); - u_idx += (u_col <= l_col); - } - sum += last_operation; // undo the last operation - if (row > col) { - auto to_write = sum / u_values[u_row_ptrs[col + 1] - 1]; - if (::gko::isfinite(to_write)) { - l_values[l_idx - 1] = to_write; - } - } else { - auto to_write = sum; - if (::gko::isfinite(to_write)) { - u_values[u_idx - 1] = to_write; - } - } - } -} - - -} // namespace kernel +#include "common/factorization/par_ilu_kernels.hpp.inc" template diff --git a/cuda/factorization/par_ilut_approx_filter_kernel.cu b/cuda/factorization/par_ilut_approx_filter_kernel.cu new file mode 100644 index 00000000000..8b7b1a88443 --- /dev/null +++ b/cuda/factorization/par_ilut_approx_filter_kernel.cu @@ -0,0 +1,206 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/components/prefix_sum.hpp" +#include "core/factorization/par_ilut_kernels.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "cuda/base/config.hpp" +#include "cuda/base/math.hpp" +#include "cuda/base/types.hpp" +#include "cuda/components/atomic.cuh" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/intrinsics.cuh" +#include "cuda/components/prefix_sum.cuh" +#include "cuda/components/sorting.cuh" +#include "cuda/components/thread_ids.cuh" +#include "cuda/factorization/par_ilut_select_common.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The parallel ILUT factorization namespace. 
+ * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +// subwarp sizes for filter kernels +using compiled_kernels = + syn::value_list; + + +#include "common/factorization/par_ilut_filter_kernels.hpp.inc" +#include "common/factorization/par_ilut_select_kernels.hpp.inc" + + +template +void threshold_filter_approx(syn::value_list, + std::shared_ptr exec, + const matrix::Csr *m, + IndexType rank, Array *tmp, + remove_complex *threshold, + matrix::Csr *m_out, + matrix::Coo *m_out_coo) +{ + auto values = m->get_const_values(); + IndexType size = m->get_num_stored_elements(); + using AbsType = remove_complex; + constexpr auto bucket_count = kernel::searchtree_width; + auto max_num_threads = ceildiv(size, items_per_thread); + auto max_num_blocks = ceildiv(max_num_threads, default_block_size); + + size_type tmp_size_totals = + ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType)); + size_type tmp_size_partials = ceildiv( + bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType)); + size_type tmp_size_oracles = + ceildiv(size * sizeof(unsigned char), sizeof(ValueType)); + size_type tmp_size_tree = + ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType)); + size_type tmp_size = + tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree; + tmp->resize_and_reset(tmp_size); + + auto total_counts = reinterpret_cast(tmp->get_data()); + auto partial_counts = + reinterpret_cast(tmp->get_data() + tmp_size_totals); + auto oracles = reinterpret_cast( + tmp->get_data() + tmp_size_totals + tmp_size_partials); + auto tree = + reinterpret_cast(tmp->get_data() + tmp_size_totals + + tmp_size_partials + tmp_size_oracles); + + sampleselect_count(values, size, tree, oracles, partial_counts, + total_counts); + + // determine bucket with correct rank + auto bucket = static_cast( + sampleselect_find_bucket(exec, total_counts, rank).idx); + *threshold = + exec->copy_val_to_host(tree + kernel::searchtree_inner_size + bucket); + 
// we implicitly set the first splitter to -inf, but 0 works as well + if (bucket == 0) { + *threshold = zero(); + } + + // filter the elements + auto old_row_ptrs = m->get_const_row_ptrs(); + auto old_col_idxs = m->get_const_col_idxs(); + auto old_vals = m->get_const_values(); + // compute nnz for each row + auto num_rows = static_cast(m->get_size()[0]); + auto block_size = default_block_size / subwarp_size; + auto num_blocks = ceildiv(num_rows, block_size); + auto new_row_ptrs = m_out->get_row_ptrs(); + kernel::bucket_filter_nnz<<>>( + old_row_ptrs, oracles, num_rows, bucket, new_row_ptrs); + + // build row pointers + components::prefix_sum(exec, new_row_ptrs, num_rows + 1); + + // build matrix + auto new_nnz = exec->copy_val_to_host(new_row_ptrs + num_rows); + // resize arrays and update aliases + matrix::CsrBuilder builder{m_out}; + builder.get_col_idx_array().resize_and_reset(new_nnz); + builder.get_value_array().resize_and_reset(new_nnz); + auto new_col_idxs = m_out->get_col_idxs(); + auto new_vals = m_out->get_values(); + IndexType *new_row_idxs{}; + if (m_out_coo) { + matrix::CooBuilder coo_builder{m_out_coo}; + coo_builder.get_row_idx_array().resize_and_reset(new_nnz); + coo_builder.get_col_idx_array() = + Array::view(exec, new_nnz, new_col_idxs); + coo_builder.get_value_array() = + Array::view(exec, new_nnz, new_vals); + new_row_idxs = m_out_coo->get_row_idxs(); + } + kernel::bucket_filter<<>>( + old_row_ptrs, old_col_idxs, as_cuda_type(old_vals), oracles, num_rows, + bucket, new_row_ptrs, new_row_idxs, new_col_idxs, + as_cuda_type(new_vals)); +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter_approx, + threshold_filter_approx); + + +template +void threshold_filter_approx(std::shared_ptr exec, + const matrix::Csr *m, + IndexType rank, Array &tmp, + remove_complex &threshold, + matrix::Csr *m_out, + matrix::Coo *m_out_coo) +{ + auto num_rows = m->get_size()[0]; + auto total_nnz = m->get_num_stored_elements(); + auto total_nnz_per_row = 
total_nnz / num_rows; + select_threshold_filter_approx( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, m, rank, &tmp, + &threshold, m_out, m_out_coo); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/factorization/par_ilut_filter_kernel.cu b/cuda/factorization/par_ilut_filter_kernel.cu new file mode 100644 index 00000000000..1b2e6e921f8 --- /dev/null +++ b/cuda/factorization/par_ilut_filter_kernel.cu @@ -0,0 +1,162 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include "core/components/prefix_sum.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "cuda/base/config.hpp" +#include "cuda/base/math.hpp" +#include "cuda/base/types.hpp" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/intrinsics.cuh" +#include "cuda/components/thread_ids.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The parallel ILUT factorization namespace. 
+ * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +constexpr auto default_block_size = 512; + + +// subwarp sizes for filter kernels +using compiled_kernels = + syn::value_list; + + +#include "common/factorization/par_ilut_filter_kernels.hpp.inc" + + +namespace { + + +template +void threshold_filter(syn::value_list, + std::shared_ptr exec, + const matrix::Csr *a, + remove_complex threshold, + matrix::Csr *m_out, + matrix::Coo *m_out_coo, bool lower) +{ + auto old_row_ptrs = a->get_const_row_ptrs(); + auto old_col_idxs = a->get_const_col_idxs(); + auto old_vals = a->get_const_values(); + // compute nnz for each row + auto num_rows = static_cast(a->get_size()[0]); + auto block_size = default_block_size / subwarp_size; + auto num_blocks = ceildiv(num_rows, block_size); + auto new_row_ptrs = m_out->get_row_ptrs(); + kernel::threshold_filter_nnz + <<>>(old_row_ptrs, + as_cuda_type(old_vals), num_rows, + threshold, new_row_ptrs, lower); + + // build row pointers + components::prefix_sum(exec, new_row_ptrs, num_rows + 1); + + // build matrix + auto new_nnz = exec->copy_val_to_host(new_row_ptrs + num_rows); + // resize arrays and update aliases + matrix::CsrBuilder builder{m_out}; + builder.get_col_idx_array().resize_and_reset(new_nnz); + builder.get_value_array().resize_and_reset(new_nnz); + auto new_col_idxs = m_out->get_col_idxs(); + auto new_vals = m_out->get_values(); + IndexType *new_row_idxs{}; + if (m_out_coo) { + matrix::CooBuilder coo_builder{m_out_coo}; + coo_builder.get_row_idx_array().resize_and_reset(new_nnz); + coo_builder.get_col_idx_array() = + Array::view(exec, new_nnz, new_col_idxs); + coo_builder.get_value_array() = + Array::view(exec, new_nnz, new_vals); + new_row_idxs = m_out_coo->get_row_idxs(); + } + kernel::threshold_filter<<>>( + old_row_ptrs, old_col_idxs, as_cuda_type(old_vals), num_rows, threshold, + new_row_ptrs, new_row_idxs, new_col_idxs, as_cuda_type(new_vals), + lower); +} + + 
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter, threshold_filter); + + +} // namespace + +template +void threshold_filter(std::shared_ptr exec, + const matrix::Csr *a, + remove_complex threshold, + matrix::Csr *m_out, + matrix::Coo *m_out_coo, bool lower) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = a->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_threshold_filter( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, a, threshold, m_out, + m_out_coo, lower); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace cuda +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/cuda/factorization/par_ilut_select_common.cu b/cuda/factorization/par_ilut_select_common.cu new file mode 100644 index 00000000000..1b564801cee --- /dev/null +++ b/cuda/factorization/par_ilut_select_common.cu @@ -0,0 +1,117 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "cuda/factorization/par_ilut_select_common.cuh" + + +#include "core/factorization/par_ilut_kernels.hpp" +#include "cuda/base/math.hpp" +#include "cuda/components/atomic.cuh" +#include "cuda/components/intrinsics.cuh" +#include "cuda/components/prefix_sum.cuh" +#include "cuda/components/searching.cuh" +#include "cuda/components/sorting.cuh" +#include "cuda/components/thread_ids.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The parallel ILUT factorization namespace. 
+ * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +#include "common/factorization/par_ilut_select_kernels.hpp.inc" + + +template +void sampleselect_count(const ValueType *values, IndexType size, + remove_complex *tree, unsigned char *oracles, + IndexType *partial_counts, IndexType *total_counts) +{ + constexpr auto bucket_count = kernel::searchtree_width; + auto num_threads_total = ceildiv(size, items_per_thread); + auto num_blocks = + static_cast(ceildiv(num_threads_total, default_block_size)); + // pick sample, build searchtree + kernel::build_searchtree<<<1, bucket_count>>>(as_cuda_type(values), size, + tree); + // determine bucket sizes + kernel::count_buckets<<>>( + as_cuda_type(values), size, tree, partial_counts, oracles, + items_per_thread); + // compute prefix sum and total sum over block-local values + kernel::block_prefix_sum<<>>( + partial_counts, total_counts, num_blocks); + // compute prefix sum over bucket counts + start_prefix_sum<<<1, bucket_count>>>( + bucket_count, total_counts, total_counts + bucket_count); +} + + +#define DECLARE_SSSS_COUNT(ValueType, IndexType) \ + void sampleselect_count(const ValueType *values, IndexType size, \ + remove_complex *tree, \ + unsigned char *oracles, IndexType *partial_counts, \ + IndexType *total_counts) + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(DECLARE_SSSS_COUNT); + + +template +sampleselect_bucket sampleselect_find_bucket( + std::shared_ptr exec, IndexType *prefix_sum, + IndexType rank) +{ + kernel::find_bucket<<<1, config::warp_size>>>(prefix_sum, rank); + IndexType values[3]{}; + exec->get_master()->copy_from(exec.get(), 3, prefix_sum, values); + return {values[0], values[1], values[2]}; +} + + +#define DECLARE_SSSS_FIND_BUCKET(IndexType) \ + sampleselect_bucket sampleselect_find_bucket( \ + std::shared_ptr exec, IndexType *prefix_sum, \ + IndexType rank) + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(DECLARE_SSSS_FIND_BUCKET); + + +} // namespace par_ilut_factorization +} // namespace 
cuda +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/cuda/factorization/par_ilut_select_common.cuh b/cuda/factorization/par_ilut_select_common.cuh new file mode 100644 index 00000000000..1f2eded3b0b --- /dev/null +++ b/cuda/factorization/par_ilut_select_common.cuh @@ -0,0 +1,78 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_CUDA_FACTORIZATION_PAR_ILUT_SELECT_COMMON_CUH_ +#define GKO_CUDA_FACTORIZATION_PAR_ILUT_SELECT_COMMON_CUH_ + + +#include +#include +#include + + +namespace gko { +namespace kernels { +namespace cuda { +namespace par_ilut_factorization { + + +constexpr auto default_block_size = 512; +constexpr auto items_per_thread = 16; + + +template +void sampleselect_count(const ValueType *values, IndexType size, + remove_complex *tree, unsigned char *oracles, + IndexType *partial_counts, IndexType *total_counts); + + +template +struct sampleselect_bucket { + IndexType idx; + IndexType begin; + IndexType size; +}; + + +template +sampleselect_bucket sampleselect_find_bucket( + std::shared_ptr exec, IndexType *prefix_sum, + IndexType rank); + + +} // namespace par_ilut_factorization +} // namespace cuda +} // namespace kernels +} // namespace gko + + +#endif // GKO_CUDA_FACTORIZATION_PAR_ILUT_SELECT_COMMON_CUH_ \ No newline at end of file diff --git a/cuda/factorization/par_ilut_select_kernel.cu b/cuda/factorization/par_ilut_select_kernel.cu new file mode 100644 index 00000000000..469bde6ccc6 --- /dev/null +++ b/cuda/factorization/par_ilut_select_kernel.cu @@ -0,0 +1,184 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include + + +#include +#include +#include + + +#include "core/components/prefix_sum.hpp" +#include "cuda/base/math.hpp" +#include "cuda/components/atomic.cuh" +#include "cuda/components/intrinsics.cuh" +#include "cuda/components/prefix_sum.cuh" +#include "cuda/components/searching.cuh" +#include "cuda/components/sorting.cuh" +#include "cuda/components/thread_ids.cuh" +#include "cuda/factorization/par_ilut_select_common.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The parallel ILUT factorization namespace. 
+ * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +#include "common/factorization/par_ilut_select_kernels.hpp.inc" + + +template +void sampleselect_filter(const ValueType *values, IndexType size, + const unsigned char *oracles, + const IndexType *partial_counts, IndexType bucket, + remove_complex *out) +{ + auto num_threads_total = ceildiv(size, items_per_thread); + auto num_blocks = + static_cast(ceildiv(num_threads_total, default_block_size)); + kernel::filter_bucket<<>>( + as_cuda_type(values), size, bucket, oracles, partial_counts, out, + items_per_thread); +} + + +template +void threshold_select(std::shared_ptr exec, + const matrix::Csr *m, + IndexType rank, Array &tmp1, + Array> &tmp2, + remove_complex &threshold) +{ + auto values = m->get_const_values(); + IndexType size = m->get_num_stored_elements(); + using AbsType = remove_complex; + constexpr auto bucket_count = kernel::searchtree_width; + auto max_num_threads = ceildiv(size, items_per_thread); + auto max_num_blocks = ceildiv(max_num_threads, default_block_size); + + size_type tmp_size_totals = + ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType)); + size_type tmp_size_partials = ceildiv( + bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType)); + size_type tmp_size_oracles = + ceildiv(size * sizeof(unsigned char), sizeof(ValueType)); + size_type tmp_size_tree = + ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType)); + size_type tmp_size_vals = + size / bucket_count * 4; // pessimistic estimate for temporary storage + size_type tmp_size = + tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree; + tmp1.resize_and_reset(tmp_size); + tmp2.resize_and_reset(tmp_size_vals); + + auto total_counts = reinterpret_cast(tmp1.get_data()); + auto partial_counts = + reinterpret_cast(tmp1.get_data() + tmp_size_totals); + auto oracles = reinterpret_cast( + tmp1.get_data() + tmp_size_totals + tmp_size_partials); + auto tree = + 
reinterpret_cast(tmp1.get_data() + tmp_size_totals + + tmp_size_partials + tmp_size_oracles); + + sampleselect_count(values, size, tree, oracles, partial_counts, + total_counts); + + // determine bucket with correct rank, use bucket-local rank + auto bucket = sampleselect_find_bucket(exec, total_counts, rank); + rank -= bucket.begin; + + if (bucket.size * 2 > tmp_size_vals) { + // we need to reallocate tmp2 + tmp2.resize_and_reset(bucket.size * 2); + } + auto tmp21 = tmp2.get_data(); + auto tmp22 = tmp2.get_data() + bucket.size; + // extract target bucket + sampleselect_filter(values, size, oracles, partial_counts, bucket.idx, + tmp22); + + // recursively select from smaller buckets + int step{}; + while (bucket.size > kernel::basecase_size) { + std::swap(tmp21, tmp22); + const auto *tmp_in = tmp21; + auto tmp_out = tmp22; + + sampleselect_count(tmp_in, bucket.size, tree, oracles, partial_counts, + total_counts); + auto new_bucket = sampleselect_find_bucket(exec, total_counts, rank); + sampleselect_filter(tmp_in, bucket.size, oracles, partial_counts, + bucket.idx, tmp_out); + + rank -= new_bucket.begin; + bucket.size = new_bucket.size; + // we should never need more than 5 recursion steps, this would mean + // 256^5 = 2^40. fall back to standard library algorithm in that case. 
+ ++step; + if (step > 5) { + Array cpu_out_array{ + exec->get_master(), + Array::view(exec, bucket.size, tmp_out)}; + auto begin = cpu_out_array.get_data(); + auto end = begin + bucket.size; + auto middle = begin + rank; + std::nth_element(begin, middle, end); + threshold = *middle; + return; + } + } + + // base case + auto out_ptr = reinterpret_cast(tmp1.get_data()); + kernel::basecase_select<<<1, kernel::basecase_block_size>>>( + tmp22, bucket.size, rank, out_ptr); + threshold = exec->copy_val_to_host(out_ptr); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/factorization/par_ilut_spgeam_kernel.cu b/cuda/factorization/par_ilut_spgeam_kernel.cu new file mode 100644 index 00000000000..1efb704e272 --- /dev/null +++ b/cuda/factorization/par_ilut_spgeam_kernel.cu @@ -0,0 +1,179 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include "core/components/prefix_sum.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "cuda/base/math.hpp" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/intrinsics.cuh" +#include "cuda/components/merging.cuh" +#include "cuda/components/prefix_sum.cuh" +#include "cuda/components/searching.cuh" +#include "cuda/components/thread_ids.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The parallel ILUT factorization namespace. 
+ * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +constexpr auto default_block_size = 512; + + +// subwarp sizes for add_candidates kernels +using compiled_kernels = + syn::value_list; + + +#include "common/factorization/par_ilut_spgeam_kernels.hpp.inc" + + +namespace { + + +template +void add_candidates(syn::value_list, + std::shared_ptr exec, + const matrix::Csr *lu, + const matrix::Csr *a, + const matrix::Csr *l, + const matrix::Csr *u, + matrix::Csr *l_new, + matrix::Csr *u_new) +{ + auto num_rows = static_cast(lu->get_size()[0]); + auto subwarps_per_block = default_block_size / subwarp_size; + auto num_blocks = ceildiv(num_rows, subwarps_per_block); + matrix::CsrBuilder l_new_builder(l_new); + matrix::CsrBuilder u_new_builder(u_new); + auto lu_row_ptrs = lu->get_const_row_ptrs(); + auto lu_col_idxs = lu->get_const_col_idxs(); + auto lu_vals = lu->get_const_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto a_vals = a->get_const_values(); + auto l_row_ptrs = l->get_const_row_ptrs(); + auto l_col_idxs = l->get_const_col_idxs(); + auto l_vals = l->get_const_values(); + auto u_row_ptrs = u->get_const_row_ptrs(); + auto u_col_idxs = u->get_const_col_idxs(); + auto u_vals = u->get_const_values(); + auto l_new_row_ptrs = l_new->get_row_ptrs(); + auto u_new_row_ptrs = u_new->get_row_ptrs(); + // count non-zeros per row + kernel::tri_spgeam_nnz<<>>( + lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs, l_new_row_ptrs, + u_new_row_ptrs, num_rows); + + // build row ptrs + components::prefix_sum(exec, l_new_row_ptrs, num_rows + 1); + components::prefix_sum(exec, u_new_row_ptrs, num_rows + 1); + + // resize output arrays + auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows); + auto u_new_nnz = exec->copy_val_to_host(u_new_row_ptrs + num_rows); + l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz); + l_new_builder.get_value_array().resize_and_reset(l_new_nnz); + 
u_new_builder.get_col_idx_array().resize_and_reset(u_new_nnz); + u_new_builder.get_value_array().resize_and_reset(u_new_nnz); + + auto l_new_col_idxs = l_new->get_col_idxs(); + auto l_new_vals = l_new->get_values(); + auto u_new_col_idxs = u_new->get_col_idxs(); + auto u_new_vals = u_new->get_values(); + + // fill columns and values + kernel::tri_spgeam_init<<>>( + lu_row_ptrs, lu_col_idxs, as_cuda_type(lu_vals), a_row_ptrs, a_col_idxs, + as_cuda_type(a_vals), l_row_ptrs, l_col_idxs, as_cuda_type(l_vals), + u_row_ptrs, u_col_idxs, as_cuda_type(u_vals), l_new_row_ptrs, + l_new_col_idxs, as_cuda_type(l_new_vals), u_new_row_ptrs, + u_new_col_idxs, as_cuda_type(u_new_vals), num_rows); +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); + + +} // namespace + + +template +void add_candidates(std::shared_ptr exec, + const matrix::Csr *lu, + const matrix::Csr *a, + const matrix::Csr *l, + const matrix::Csr *u, + matrix::Csr *l_new, + matrix::Csr *u_new) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = + lu->get_num_stored_elements() + a->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_add_candidates( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, lu, a, l, u, l_new, + u_new); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/factorization/par_ilut_sweep_kernel.cu b/cuda/factorization/par_ilut_sweep_kernel.cu new file mode 100644 index 00000000000..91b68b723da --- /dev/null +++ b/cuda/factorization/par_ilut_sweep_kernel.cu @@ -0,0 +1,145 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights 
reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include "core/components/prefix_sum.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "cuda/base/math.hpp" +#include "cuda/components/intrinsics.cuh" +#include "cuda/components/merging.cuh" +#include "cuda/components/prefix_sum.cuh" +#include "cuda/components/reduction.cuh" +#include "cuda/components/searching.cuh" +#include "cuda/components/thread_ids.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The parallel ILUT factorization namespace. + * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +constexpr auto default_block_size = 512; + + +// subwarp sizes for all warp-parallel kernels (filter, add_candidates) +using compiled_kernels = + syn::value_list; + + +#include "common/factorization/par_ilut_sweep_kernels.hpp.inc" + + +namespace { + + +template +void compute_l_u_factors(syn::value_list, + std::shared_ptr exec, + const matrix::Csr *a, + matrix::Csr *l, + const matrix::Coo *l_coo, + matrix::Csr *u, + const matrix::Coo *u_coo, + matrix::Csr *u_csc) +{ + auto total_nnz = static_cast(l->get_num_stored_elements() + + u->get_num_stored_elements()); + auto block_size = default_block_size / subwarp_size; + auto num_blocks = ceildiv(total_nnz, block_size); + kernel::sweep<<>>( + a->get_const_row_ptrs(), a->get_const_col_idxs(), + as_cuda_type(a->get_const_values()), l->get_const_row_ptrs(), + l_coo->get_const_row_idxs(), l->get_const_col_idxs(), + as_cuda_type(l->get_values()), + static_cast(l->get_num_stored_elements()), + u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(), + as_cuda_type(u->get_values()), u_csc->get_const_row_ptrs(), + u_csc->get_const_col_idxs(), 
as_cuda_type(u_csc->get_values()), + static_cast(u->get_num_stored_elements())); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_l_u_factors, + compute_l_u_factors); + + +} // namespace + + +template +void compute_l_u_factors(std::shared_ptr exec, + const matrix::Csr *a, + matrix::Csr *l, + const matrix::Coo *l_coo, + matrix::Csr *u, + const matrix::Coo *u_coo, + matrix::Csr *u_csc) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = + l->get_num_stored_elements() + u->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_compute_l_u_factors( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, a, l, l_coo, u, u_coo, + u_csc); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/matrix/coo_kernels.cu b/cuda/matrix/coo_kernels.cu index 446280ba185..ef94a07a8a2 100644 --- a/cuda/matrix/coo_kernels.cu +++ b/cuda/matrix/coo_kernels.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -40,7 +40,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/components/fill_array.hpp" #include "core/matrix/dense_kernels.hpp" +#include "cuda/base/config.hpp" #include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/math.hpp" #include "cuda/base/types.hpp" @@ -48,7 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/format_conversion.cuh" #include "cuda/components/segment_scan.cuh" -#include "cuda/components/zero_array.hpp" +#include "cuda/components/thread_ids.cuh" namespace gko { @@ -69,188 +71,10 @@ namespace coo { constexpr int default_block_size = 512; constexpr int warps_in_block = 4; -constexpr int spmv_block_size = warps_in_block * cuda_config::warp_size; +constexpr int spmv_block_size = warps_in_block * config::warp_size; -namespace { - - -/** - * The device function of COO spmv - * - * @param nnz the number of nonzeros in the matrix - * @param num_lines the maximum round of each warp - * @param val the value array of the matrix - * @param col the column index array of the matrix - * @param row the row index array of the matrix - * @param b the input dense vector - * @param b_stride the stride of the input dense vector - * @param c the output dense vector - * @param c_stride the stride of the output dense vector - * @param scale the function on the added value - */ -template -__device__ void spmv_kernel(const size_type nnz, const size_type num_lines, - const ValueType *__restrict__ val, - const IndexType *__restrict__ col, - const IndexType *__restrict__ row, - const ValueType *__restrict__ b, - const size_type b_stride, ValueType *__restrict__ c, - const size_type c_stride, Closure scale) -{ - ValueType temp_val = zero(); - const auto start = static_cast(blockDim.x) * blockIdx.x * - blockDim.y * num_lines + - threadIdx.y * blockDim.x * num_lines; - const auto column_id = blockIdx.y; - size_type num = (nnz > start) * ceildiv(nnz - start, subwarp_size); - num = min(num, num_lines); - const IndexType ind_start = start + threadIdx.x; - const IndexType ind_end = ind_start + (num - 1) * subwarp_size; - IndexType ind = ind_start; - IndexType curr_row = (ind < nnz) ? 
row[ind] : 0; - const auto tile_block = - group::tiled_partition(group::this_thread_block()); - for (; ind < ind_end; ind += subwarp_size) { - temp_val += (ind < nnz) ? val[ind] * b[col[ind] * b_stride + column_id] - : zero(); - auto next_row = - (ind + subwarp_size < nnz) ? row[ind + subwarp_size] : row[nnz - 1]; - // segmented scan - if (tile_block.any(curr_row != next_row)) { - bool is_first_in_segment = - segment_scan(tile_block, curr_row, &temp_val); - if (is_first_in_segment) { - atomic_add(&(c[curr_row * c_stride + column_id]), - scale(temp_val)); - } - temp_val = zero(); - } - curr_row = next_row; - } - if (num > 0) { - ind = ind_end; - temp_val += (ind < nnz) ? val[ind] * b[col[ind] * b_stride + column_id] - : zero(); - // segmented scan - bool is_first_in_segment = - segment_scan(tile_block, curr_row, &temp_val); - if (is_first_in_segment) { - atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp_val)); - } - } -} - - -template -__global__ __launch_bounds__(spmv_block_size) void abstract_spmv( - const size_type nnz, const size_type num_lines, - const ValueType *__restrict__ val, const IndexType *__restrict__ col, - const IndexType *__restrict__ row, const ValueType *__restrict__ b, - const size_type b_stride, ValueType *__restrict__ c, - const size_type c_stride) -{ - spmv_kernel(nnz, num_lines, val, col, row, b, b_stride, c, c_stride, - [](const ValueType &x) { return x; }); -} - - -template -__global__ __launch_bounds__(spmv_block_size) void abstract_spmv( - const size_type nnz, const size_type num_lines, - const ValueType *__restrict__ alpha, const ValueType *__restrict__ val, - const IndexType *__restrict__ col, const IndexType *__restrict__ row, - const ValueType *__restrict__ b, const size_type b_stride, - ValueType *__restrict__ c, const size_type c_stride) -{ - ValueType scale_factor = alpha[0]; - spmv_kernel( - nnz, num_lines, val, col, row, b, b_stride, c, c_stride, - [&scale_factor](const ValueType &x) { return scale_factor * x; }); -} - 
- -/** - * The device function of COO spmm - * - * @param nnz the number of nonzeros in the matrix - * @param num_elems the maximum number of nonzeros in each warp - * @param val the value array of the matrix - * @param col the column index array of the matrix - * @param row the row index array of the matrix - * @param num_cols the number of columns of the matrix - * @param b the input dense vector - * @param b_stride the stride of the input dense vector - * @param c the output dense vector - * @param c_stride the stride of the output dense vector - * @param scale the function on the added value - */ -template -__device__ void spmm_kernel(const size_type nnz, const size_type num_elems, - const ValueType *__restrict__ val, - const IndexType *__restrict__ col, - const IndexType *__restrict__ row, - const size_type num_cols, - const ValueType *__restrict__ b, - const size_type b_stride, ValueType *__restrict__ c, - const size_type c_stride, Closure scale) -{ - ValueType temp = zero(); - const auto coo_idx = - (static_cast(blockDim.y) * blockIdx.x + threadIdx.y) * - num_elems; - const auto column_id = blockIdx.y * blockDim.x + threadIdx.x; - const auto coo_end = - (coo_idx + num_elems > nnz) ? 
nnz : coo_idx + num_elems; - if (column_id < num_cols && coo_idx < nnz) { - auto curr_row = row[coo_idx]; - auto idx = coo_idx; - for (; idx < coo_end - 1; idx++) { - temp += val[idx] * b[col[idx] * b_stride + column_id]; - const auto next_row = row[idx + 1]; - if (next_row != curr_row) { - atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp)); - curr_row = next_row; - temp = zero(); - } - } - temp += val[idx] * b[col[idx] * b_stride + column_id]; - atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp)); - } -} - - -template -__global__ __launch_bounds__(spmv_block_size) void abstract_spmm( - const size_type nnz, const size_type num_elems, - const ValueType *__restrict__ val, const IndexType *__restrict__ col, - const IndexType *__restrict__ row, const size_type num_cols, - const ValueType *__restrict__ b, const size_type b_stride, - ValueType *__restrict__ c, const size_type c_stride) -{ - spmm_kernel(nnz, num_elems, val, col, row, num_cols, b, b_stride, c, - c_stride, [](const ValueType &x) { return x; }); -} - - -template -__global__ __launch_bounds__(spmv_block_size) void abstract_spmm( - const size_type nnz, const size_type num_elems, - const ValueType *__restrict__ alpha, const ValueType *__restrict__ val, - const IndexType *__restrict__ col, const IndexType *__restrict__ row, - const size_type num_cols, const ValueType *__restrict__ b, - const size_type b_stride, ValueType *__restrict__ c, - const size_type c_stride) -{ - ValueType scale_factor = alpha[0]; - spmm_kernel( - nnz, num_elems, val, col, row, num_cols, b, b_stride, c, c_stride, - [&scale_factor](const ValueType &x) { return scale_factor * x; }); -} - - -} // namespace +#include "common/matrix/coo_kernels.hpp.inc" template @@ -258,7 +82,8 @@ void spmv(std::shared_ptr exec, const matrix::Coo *a, const matrix::Dense *b, matrix::Dense *c) { - zero_array(c->get_num_stored_elements(), c->get_values()); + components::fill_array(exec, c->get_values(), c->get_num_stored_elements(), + 
zero()); spmv2(exec, a, b, c); } @@ -289,23 +114,23 @@ void spmv2(std::shared_ptr exec, { const auto nnz = a->get_num_stored_elements(); const auto b_ncols = b->get_size()[1]; - const dim3 coo_block(cuda_config::warp_size, warps_in_block, 1); + const dim3 coo_block(config::warp_size, warps_in_block, 1); const auto nwarps = host_kernel::calculate_nwarps(exec, nnz); if (nwarps > 0) { if (b_ncols < 4) { const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols); - int num_lines = ceildiv(nnz, nwarps * cuda_config::warp_size); + int num_lines = ceildiv(nnz, nwarps * config::warp_size); abstract_spmv<<>>( nnz, num_lines, as_cuda_type(a->get_const_values()), a->get_const_col_idxs(), as_cuda_type(a->get_const_row_idxs()), as_cuda_type(b->get_const_values()), b->get_stride(), as_cuda_type(c->get_values()), c->get_stride()); } else { - int num_elems = ceildiv(nnz, nwarps * cuda_config::warp_size) * - cuda_config::warp_size; + int num_elems = + ceildiv(nnz, nwarps * config::warp_size) * config::warp_size; const dim3 coo_grid(ceildiv(nwarps, warps_in_block), - ceildiv(b_ncols, cuda_config::warp_size)); + ceildiv(b_ncols, config::warp_size)); abstract_spmm<<>>( nnz, num_elems, as_cuda_type(a->get_const_values()), a->get_const_col_idxs(), as_cuda_type(a->get_const_row_idxs()), @@ -327,12 +152,12 @@ void advanced_spmv2(std::shared_ptr exec, { const auto nnz = a->get_num_stored_elements(); const auto nwarps = host_kernel::calculate_nwarps(exec, nnz); - const dim3 coo_block(cuda_config::warp_size, warps_in_block, 1); + const dim3 coo_block(config::warp_size, warps_in_block, 1); const auto b_ncols = b->get_size()[1]; if (nwarps > 0) { if (b_ncols < 4) { - int num_lines = ceildiv(nnz, nwarps * cuda_config::warp_size); + int num_lines = ceildiv(nnz, nwarps * config::warp_size); const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols); abstract_spmv<<>>( nnz, num_lines, as_cuda_type(alpha->get_const_values()), @@ -341,10 +166,10 @@ void advanced_spmv2(std::shared_ptr exec, 
as_cuda_type(b->get_const_values()), b->get_stride(), as_cuda_type(c->get_values()), c->get_stride()); } else { - int num_elems = ceildiv(nnz, nwarps * cuda_config::warp_size) * - cuda_config::warp_size; + int num_elems = + ceildiv(nnz, nwarps * config::warp_size) * config::warp_size; const dim3 coo_grid(ceildiv(nwarps, warps_in_block), - ceildiv(b_ncols, cuda_config::warp_size)); + ceildiv(b_ncols, config::warp_size)); abstract_spmm<<>>( nnz, num_elems, as_cuda_type(alpha->get_const_values()), as_cuda_type(a->get_const_values()), a->get_const_col_idxs(), @@ -358,31 +183,6 @@ void advanced_spmv2(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL); -namespace kernel { - -template -__global__ __launch_bounds__(default_block_size) void convert_row_idxs_to_ptrs( - const IndexType *__restrict__ idxs, size_type num_nonzeros, - IndexType *__restrict__ ptrs, size_type length) -{ - const auto tidx = threadIdx.x + blockIdx.x * blockDim.x; - - if (tidx == 0) { - ptrs[0] = 0; - ptrs[length - 1] = num_nonzeros; - } - - if (0 < tidx && tidx < num_nonzeros) { - if (idxs[tidx - 1] < idxs[tidx]) { - for (auto i = idxs[tidx - 1] + 1; i <= idxs[tidx]; i++) { - ptrs[i] = tidx; - } - } - } -} - -} // namespace kernel - template void convert_row_idxs_to_ptrs(std::shared_ptr exec, @@ -398,8 +198,8 @@ void convert_row_idxs_to_ptrs(std::shared_ptr exec, template void convert_to_csr(std::shared_ptr exec, - matrix::Csr *result, - const matrix::Coo *source) + const matrix::Coo *source, + matrix::Csr *result) { auto num_rows = result->get_size()[0]; @@ -416,44 +216,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_COO_CONVERT_TO_CSR_KERNEL); -namespace kernel { - - -template -__global__ - __launch_bounds__(cuda_config::max_block_size) void initialize_zero_dense( - size_type num_rows, size_type num_cols, size_type stride, - ValueType *__restrict__ result) -{ - const auto tidx_x = threadIdx.x + blockDim.x * blockIdx.x; - 
const auto tidx_y = threadIdx.y + blockDim.y * blockIdx.y; - if (tidx_x < num_cols && tidx_y < num_rows) { - result[tidx_y * stride + tidx_x] = zero(); - } -} - - -template -__global__ __launch_bounds__(default_block_size) void fill_in_dense( - size_type nnz, const IndexType *__restrict__ row_idxs, - const IndexType *__restrict__ col_idxs, - const ValueType *__restrict__ values, size_type stride, - ValueType *__restrict__ result) -{ - const auto tidx = threadIdx.x + blockDim.x * blockIdx.x; - if (tidx < nnz) { - result[stride * row_idxs[tidx] + col_idxs[tidx]] = values[tidx]; - } -} - - -} // namespace kernel - - template void convert_to_dense(std::shared_ptr exec, - matrix::Dense *result, - const matrix::Coo *source) + const matrix::Coo *source, + matrix::Dense *result) { const auto num_rows = result->get_size()[0]; const auto num_cols = result->get_size()[1]; @@ -461,9 +227,8 @@ void convert_to_dense(std::shared_ptr exec, const auto nnz = source->get_num_stored_elements(); - const dim3 block_size(cuda_config::warp_size, - cuda_config::max_block_size / cuda_config::warp_size, - 1); + const dim3 block_size(config::warp_size, + config::max_block_size / config::warp_size, 1); const dim3 init_grid_dim(ceildiv(stride, block_size.x), ceildiv(num_rows, block_size.y), 1); kernel::initialize_zero_dense<<>>( diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index d0b46c83c51..f1781d2ce5c 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include @@ -45,19 +46,24 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/components/fill_array.hpp" +#include "core/components/prefix_sum.hpp" +#include "core/matrix/csr_builder.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" +#include "cuda/base/config.hpp" #include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/math.hpp" #include "cuda/base/pointer_mode_guard.hpp" #include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" #include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/prefix_sum.cuh" +#include "cuda/components/intrinsics.cuh" +#include "cuda/components/merging.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/segment_scan.cuh" +#include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" -#include "cuda/components/zero_array.hpp" namespace gko { @@ -73,9 +79,9 @@ namespace csr { constexpr int default_block_size = 512; constexpr int warps_in_block = 4; -constexpr int spmv_block_size = warps_in_block * cuda_config::warp_size; -constexpr int classical_block_size = 64; -constexpr int wsize = cuda_config::warp_size; +constexpr int spmv_block_size = warps_in_block * config::warp_size; +constexpr int wsize = config::warp_size; +constexpr int classical_overweight = 32; /** @@ -84,463 +90,14 @@ constexpr int wsize = cuda_config::warp_size; */ using compiled_kernels = syn::value_list; +using classical_kernels = + syn::value_list; -namespace kernel { +using spgeam_kernels = + syn::value_list; -template -__host__ __device__ __forceinline__ T ceildivT(T nom, T denom) -{ - return (nom + denom - 1ll) / denom; -} - - -template -__device__ __forceinline__ bool block_segment_scan_reverse( - const IndexType *__restrict__ ind, ValueType *__restrict__ val) -{ - bool last = true; - const auto reg_ind = ind[threadIdx.x]; -#pragma unroll - for (int i = 1; i < spmv_block_size; i <<= 1) { - if (i == 1 && threadIdx.x < spmv_block_size - 1 && - reg_ind == ind[threadIdx.x + 1]) { - 
last = false; - } - auto temp = zero(); - if (threadIdx.x >= i && reg_ind == ind[threadIdx.x - i]) { - temp = val[threadIdx.x - i]; - } - group::this_thread_block().sync(); - val[threadIdx.x] += temp; - group::this_thread_block().sync(); - } - - return last; -} - - -template -__device__ __forceinline__ void find_next_row( - const IndexType num_rows, const IndexType data_size, const IndexType ind, - IndexType *__restrict__ row, IndexType *__restrict__ row_end, - const IndexType row_predict, const IndexType row_predict_end, - const IndexType *__restrict__ row_ptr) -{ - if (!overflow || ind < data_size) { - if (ind >= *row_end) { - *row = row_predict; - *row_end = row_predict_end; - for (; ind >= *row_end; *row_end = row_ptr[++*row + 1]) - ; - } - - } else { - *row = num_rows - 1; - *row_end = data_size; - } -} - - -template -__device__ __forceinline__ void warp_atomic_add( - const group::thread_block_tile &group, bool force_write, - ValueType *__restrict__ val, const IndexType row, ValueType *__restrict__ c, - const size_type c_stride, const IndexType column_id, Closure scale) -{ - // do a local scan to avoid atomic collisions - const bool need_write = segment_scan(group, row, val); - if (need_write && force_write) { - atomic_add(&(c[row * c_stride + column_id]), scale(*val)); - } - if (!need_write || force_write) { - *val = zero(); - } -} - - -template -__device__ __forceinline__ void process_window( - const group::thread_block_tile &group, - const IndexType num_rows, const IndexType data_size, const IndexType ind, - IndexType *__restrict__ row, IndexType *__restrict__ row_end, - IndexType *__restrict__ nrow, IndexType *__restrict__ nrow_end, - ValueType *__restrict__ temp_val, const ValueType *__restrict__ val, - const IndexType *__restrict__ col_idxs, - const IndexType *__restrict__ row_ptrs, const ValueType *__restrict__ b, - const size_type b_stride, ValueType *__restrict__ c, - const size_type c_stride, const IndexType column_id, Closure scale) -{ - const 
IndexType curr_row = *row; - find_next_row(num_rows, data_size, ind, row, row_end, *nrow, - *nrow_end, row_ptrs); - // segmented scan - if (group.any(curr_row != *row)) { - warp_atomic_add(group, curr_row != *row, temp_val, curr_row, c, - c_stride, column_id, scale); - *nrow = group.shfl(*row, subwarp_size - 1); - *nrow_end = group.shfl(*row_end, subwarp_size - 1); - } - - if (!last || ind < data_size) { - const auto col = col_idxs[ind]; - *temp_val += val[ind] * b[col * b_stride + column_id]; - } -} - - -template -__device__ __forceinline__ IndexType get_warp_start_idx( - const IndexType nwarps, const IndexType nnz, const IndexType warp_idx) -{ - const long long cache_lines = ceildivT(nnz, wsize); - return (warp_idx * cache_lines / nwarps) * wsize; -} - - -template -__device__ __forceinline__ void spmv_kernel( - const IndexType nwarps, const IndexType num_rows, - const ValueType *__restrict__ val, const IndexType *__restrict__ col_idxs, - const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow, - const ValueType *__restrict__ b, const size_type b_stride, - ValueType *__restrict__ c, const size_type c_stride, Closure scale) -{ - const IndexType warp_idx = blockIdx.x * warps_in_block + threadIdx.y; - const IndexType column_id = blockIdx.y; - if (warp_idx >= nwarps) { - return; - } - const IndexType data_size = row_ptrs[num_rows]; - const IndexType start = get_warp_start_idx(nwarps, data_size, warp_idx); - const IndexType end = - min(get_warp_start_idx(nwarps, data_size, warp_idx + 1), - ceildivT(data_size, wsize) * wsize); - auto row = srow[warp_idx]; - auto row_end = row_ptrs[row + 1]; - auto nrow = row; - auto nrow_end = row_end; - ValueType temp_val = zero(); - IndexType ind = start + threadIdx.x; - find_next_row(num_rows, data_size, ind, &row, &row_end, nrow, - nrow_end, row_ptrs); - const IndexType ind_end = end - wsize; - const auto tile_block = - group::tiled_partition(group::this_thread_block()); - for (; ind < ind_end; ind += wsize) { - 
process_window(tile_block, num_rows, data_size, ind, &row, - &row_end, &nrow, &nrow_end, &temp_val, val, - col_idxs, row_ptrs, b, b_stride, c, c_stride, - column_id, scale); - } - process_window(tile_block, num_rows, data_size, ind, &row, &row_end, - &nrow, &nrow_end, &temp_val, val, col_idxs, row_ptrs, - b, b_stride, c, c_stride, column_id, scale); - warp_atomic_add(tile_block, true, &temp_val, row, c, c_stride, column_id, - scale); -} - - -template -__global__ __launch_bounds__(spmv_block_size) void abstract_spmv( - const IndexType nwarps, const IndexType num_rows, - const ValueType *__restrict__ val, const IndexType *__restrict__ col_idxs, - const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow, - const ValueType *__restrict__ b, const size_type b_stride, - ValueType *__restrict__ c, const size_type c_stride) -{ - spmv_kernel(nwarps, num_rows, val, col_idxs, row_ptrs, srow, b, b_stride, c, - c_stride, [](const ValueType &x) { return x; }); -} - - -template -__global__ __launch_bounds__(spmv_block_size) void abstract_spmv( - const IndexType nwarps, const IndexType num_rows, - const ValueType *__restrict__ alpha, const ValueType *__restrict__ val, - const IndexType *__restrict__ col_idxs, - const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow, - const ValueType *__restrict__ b, const size_type b_stride, - ValueType *__restrict__ c, const size_type c_stride) -{ - ValueType scale_factor = alpha[0]; - spmv_kernel(nwarps, num_rows, val, col_idxs, row_ptrs, srow, b, b_stride, c, - c_stride, [&scale_factor](const ValueType &x) { - return scale_factor * x; - }); -} - - -template -__global__ __launch_bounds__(default_block_size) void set_zero( - const size_type nnz, ValueType *__restrict__ val) -{ - const auto ind = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; - if (ind < nnz) { - val[ind] = zero(); - } -} - - -template -__forceinline__ __device__ void merge_path_search( - const IndexType diagonal, const IndexType a_len, 
const IndexType b_len, - const IndexType *__restrict__ a, const IndexType offset_b, - IndexType *__restrict__ x, IndexType *__restrict__ y) -{ - auto x_min = max(diagonal - b_len, zero()); - auto x_max = min(diagonal, a_len); - while (x_min < x_max) { - auto pivot = (x_min + x_max) >> 1; - if (a[pivot] <= offset_b + diagonal - pivot - 1) { - x_min = pivot + 1; - } else { - x_max = pivot; - } - } - - *x = min(x_min, a_len); - *y = diagonal - x_min; -} - - -template -__device__ void reduce(const IndexType nwarps, - const ValueType *__restrict__ last_val, - const IndexType *__restrict__ last_row, - ValueType *__restrict__ c, const size_type c_stride, - Alpha_op alpha_op) -{ - const IndexType cache_lines = ceildivT(nwarps, spmv_block_size); - const IndexType tid = threadIdx.x; - const IndexType start = min(tid * cache_lines, nwarps); - const IndexType end = min((tid + 1) * cache_lines, nwarps); - ValueType value = zero(); - IndexType row = last_row[nwarps - 1]; - if (start < nwarps) { - value = last_val[start]; - row = last_row[start]; - for (IndexType i = start + 1; i < end; i++) { - if (last_row[i] != row) { - c[row * c_stride] += alpha_op(value); - row = last_row[i]; - value = last_val[i]; - } else { - value += last_val[i]; - } - } - } - __shared__ UninitializedArray tmp_ind; - __shared__ UninitializedArray tmp_val; - tmp_val[threadIdx.x] = value; - tmp_ind[threadIdx.x] = row; - group::this_thread_block().sync(); - bool last = block_segment_scan_reverse(static_cast(tmp_ind), - static_cast(tmp_val)); - group::this_thread_block().sync(); - if (last) { - c[row * c_stride] += alpha_op(tmp_val[threadIdx.x]); - } -} - - -template -__device__ void merge_path_spmv( - const IndexType num_rows, const ValueType *__restrict__ val, - const IndexType *__restrict__ col_idxs, - const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow, - const ValueType *__restrict__ b, const size_type b_stride, - ValueType *__restrict__ c, const size_type c_stride, - IndexType 
*__restrict__ row_out, ValueType *__restrict__ val_out, - Alpha_op alpha_op, Beta_op beta_op) -{ - const auto *row_end_ptrs = row_ptrs + 1; - const auto nnz = row_ptrs[num_rows]; - const IndexType num_merge_items = num_rows + nnz; - const auto block_items = spmv_block_size * items_per_thread; - __shared__ IndexType shared_row_ptrs[block_items]; - const IndexType diagonal = - min(static_cast(block_items * blockIdx.x), num_merge_items); - const IndexType diagonal_end = min(diagonal + block_items, num_merge_items); - IndexType block_start_x; - IndexType block_start_y; - IndexType end_x; - IndexType end_y; - merge_path_search(diagonal, num_rows, nnz, row_end_ptrs, zero(), - &block_start_x, &block_start_y); - merge_path_search(diagonal_end, num_rows, nnz, row_end_ptrs, - zero(), &end_x, &end_y); - const IndexType block_num_rows = end_x - block_start_x; - const IndexType block_num_nonzeros = end_y - block_start_y; - for (int i = threadIdx.x; - i < block_num_rows && block_start_x + i < num_rows; - i += spmv_block_size) { - shared_row_ptrs[i] = row_end_ptrs[block_start_x + i]; - } - group::this_thread_block().sync(); - - IndexType start_x; - IndexType start_y; - merge_path_search(static_cast(items_per_thread * threadIdx.x), - block_num_rows, block_num_nonzeros, shared_row_ptrs, - block_start_y, &start_x, &start_y); - - - IndexType ind = block_start_y + start_y; - IndexType row_i = block_start_x + start_x; - ValueType value = zero(); -#pragma unroll - for (IndexType i = 0; i < items_per_thread; i++) { - if (row_i < num_rows) { - if (start_x == block_num_rows || ind < shared_row_ptrs[start_x]) { - value += val[ind] * b[col_idxs[ind] * b_stride]; - ind++; - } else { - c[row_i * c_stride] = - alpha_op(value) + beta_op(c[row_i * c_stride]); - start_x++; - row_i++; - value = zero(); - } - } - } - group::this_thread_block().sync(); - IndexType *tmp_ind = shared_row_ptrs; - ValueType *tmp_val = - reinterpret_cast(shared_row_ptrs + spmv_block_size); - tmp_val[threadIdx.x] = value; 
- tmp_ind[threadIdx.x] = row_i; - group::this_thread_block().sync(); - bool last = block_segment_scan_reverse(static_cast(tmp_ind), - static_cast(tmp_val)); - if (threadIdx.x == spmv_block_size - 1) { - row_out[blockIdx.x] = min(end_x, num_rows - 1); - val_out[blockIdx.x] = tmp_val[threadIdx.x]; - } else if (last) { - c[row_i * c_stride] += alpha_op(tmp_val[threadIdx.x]); - } -} - -template -__global__ __launch_bounds__(spmv_block_size) void abstract_merge_path_spmv( - const IndexType num_rows, const ValueType *__restrict__ val, - const IndexType *__restrict__ col_idxs, - const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow, - const ValueType *__restrict__ b, const size_type b_stride, - ValueType *__restrict__ c, const size_type c_stride, - IndexType *__restrict__ row_out, ValueType *__restrict__ val_out) -{ - merge_path_spmv( - num_rows, val, col_idxs, row_ptrs, srow, b, b_stride, c, c_stride, - row_out, val_out, [](ValueType &x) { return x; }, - [](ValueType &x) { return zero(); }); -} - - -template -__global__ __launch_bounds__(spmv_block_size) void abstract_merge_path_spmv( - const IndexType num_rows, const ValueType *__restrict__ alpha, - const ValueType *__restrict__ val, const IndexType *__restrict__ col_idxs, - const IndexType *__restrict__ row_ptrs, const IndexType *__restrict__ srow, - const ValueType *__restrict__ b, const size_type b_stride, - const ValueType *__restrict__ beta, ValueType *__restrict__ c, - const size_type c_stride, IndexType *__restrict__ row_out, - ValueType *__restrict__ val_out) -{ - const auto alpha_val = alpha[0]; - const auto beta_val = beta[0]; - merge_path_spmv( - num_rows, val, col_idxs, row_ptrs, srow, b, b_stride, c, c_stride, - row_out, val_out, [&alpha_val](ValueType &x) { return alpha_val * x; }, - [&beta_val](ValueType &x) { return beta_val * x; }); -} - - -template -__global__ __launch_bounds__(spmv_block_size) void abstract_reduce( - const IndexType nwarps, const ValueType *__restrict__ last_val, 
- const IndexType *__restrict__ last_row, ValueType *__restrict__ c, - const size_type c_stride) -{ - reduce(nwarps, last_val, last_row, c, c_stride, - [](ValueType &x) { return x; }); -} - - -template -__global__ __launch_bounds__(spmv_block_size) void abstract_reduce( - const IndexType nwarps, const ValueType *__restrict__ last_val, - const IndexType *__restrict__ last_row, const ValueType *__restrict__ alpha, - ValueType *__restrict__ c, const size_type c_stride) -{ - const auto alpha_val = alpha[0]; - reduce(nwarps, last_val, last_row, c, c_stride, - [&alpha_val](ValueType &x) { return alpha_val * x; }); -} - - -template -__device__ void classical_spmv(const size_type num_rows, - const ValueType *__restrict__ val, - const IndexType *__restrict__ col_idxs, - const IndexType *__restrict__ row_ptrs, - const ValueType *__restrict__ b, - const size_type b_stride, - ValueType *__restrict__ c, - const size_type c_stride, Closure scale) -{ - const auto tid = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; - if (tid >= num_rows) { - return; - } - const auto column_id = blockIdx.y; - const auto ind_end = row_ptrs[tid + 1]; - ValueType temp_value = zero(); - for (auto ind = row_ptrs[tid]; ind < ind_end; ind++) { - temp_value += val[ind] * b[col_idxs[ind] * b_stride + column_id]; - } - c[tid * c_stride + column_id] = - scale(temp_value, c[tid * c_stride + column_id]); -} - - -template -__global__ __launch_bounds__(classical_block_size) void abstract_classical_spmv( - const size_type num_rows, const ValueType *__restrict__ val, - const IndexType *__restrict__ col_idxs, - const IndexType *__restrict__ row_ptrs, const ValueType *__restrict__ b, - const size_type b_stride, ValueType *__restrict__ c, - const size_type c_stride) -{ - classical_spmv(num_rows, val, col_idxs, row_ptrs, b, b_stride, c, c_stride, - [](const ValueType &x, const ValueType &y) { return x; }); -} - - -template -__global__ __launch_bounds__(classical_block_size) void abstract_classical_spmv( - const 
size_type num_rows, const ValueType *__restrict__ alpha, - const ValueType *__restrict__ val, const IndexType *__restrict__ col_idxs, - const IndexType *__restrict__ row_ptrs, const ValueType *__restrict__ b, - const size_type b_stride, const ValueType *__restrict__ beta, - ValueType *__restrict__ c, const size_type c_stride) -{ - const auto alpha_val = alpha[0]; - const auto beta_val = beta[0]; - classical_spmv( - num_rows, val, col_idxs, row_ptrs, b, b_stride, c, c_stride, - [&alpha_val, &beta_val](const ValueType &x, const ValueType &y) { - return alpha_val * x + beta_val * y; - }); -} - - -} // namespace kernel +#include "common/matrix/csr_kernels.hpp.inc" namespace host_kernel { @@ -614,8 +171,8 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_merge_path_spmv, merge_path_spmv); template int compute_items_per_thread(std::shared_ptr exec) { - const int version = exec->get_major_version() - << 4 + exec->get_minor_version(); + const int version = + (exec->get_major_version() << 4) + exec->get_minor_version(); // The num_item is decided to make the occupancy 100% // TODO: Extend this list when new GPU is released // Tune this parameter @@ -640,7 +197,7 @@ int compute_items_per_thread(std::shared_ptr exec) case 0x37: num_item = 14; } - // Ensure that satisfy: + // Ensure that the following is satisfied: // sizeof(IndexType) + sizeof(ValueType) // <= items_per_thread * sizeof(IndexType) constexpr int minimal_num = @@ -650,6 +207,46 @@ int compute_items_per_thread(std::shared_ptr exec) } +template +void classical_spmv(syn::value_list, + std::shared_ptr exec, + const matrix::Csr *a, + const matrix::Dense *b, + matrix::Dense *c, + const matrix::Dense *alpha = nullptr, + const matrix::Dense *beta = nullptr) +{ + const auto nwarps = exec->get_num_warps_per_sm() * + exec->get_num_multiprocessor() * classical_overweight; + const auto gridx = + std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size), + int64(nwarps / warps_in_block)); + const dim3 grid(gridx, 
b->get_size()[1]); + const dim3 block(spmv_block_size); + + if (alpha == nullptr && beta == nullptr) { + kernel::abstract_classical_spmv<<>>( + a->get_size()[0], as_cuda_type(a->get_const_values()), + a->get_const_col_idxs(), as_cuda_type(a->get_const_row_ptrs()), + as_cuda_type(b->get_const_values()), b->get_stride(), + as_cuda_type(c->get_values()), c->get_stride()); + + } else if (alpha != nullptr && beta != nullptr) { + kernel::abstract_classical_spmv<<>>( + a->get_size()[0], as_cuda_type(alpha->get_const_values()), + as_cuda_type(a->get_const_values()), a->get_const_col_idxs(), + as_cuda_type(a->get_const_row_ptrs()), + as_cuda_type(b->get_const_values()), b->get_stride(), + as_cuda_type(beta->get_const_values()), + as_cuda_type(c->get_values()), c->get_stride()); + } else { + GKO_KERNEL_NOT_FOUND; + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv); + + } // namespace host_kernel @@ -659,10 +256,11 @@ void spmv(std::shared_ptr exec, const matrix::Dense *b, matrix::Dense *c) { if (a->get_strategy()->get_name() == "load_balance") { - zero_array(c->get_num_stored_elements(), c->get_values()); + components::fill_array(exec, c->get_values(), + c->get_num_stored_elements(), zero()); const IndexType nwarps = a->get_num_srow_elements(); if (nwarps > 0) { - const dim3 csr_block(cuda_config::warp_size, warps_in_block, 1); + const dim3 csr_block(config::warp_size, warps_in_block, 1); const dim3 csr_grid(ceildiv(nwarps, warps_in_block), b->get_size()[1]); kernel::abstract_spmv<<>>( @@ -686,14 +284,26 @@ void spmv(std::shared_ptr exec, }, syn::value_list(), syn::type_list<>(), exec, a, b, c); } else if (a->get_strategy()->get_name() == "classical") { - const dim3 grid(ceildiv(a->get_size()[0], classical_block_size), - b->get_size()[1]); - kernel::abstract_classical_spmv<<>>( - a->get_size()[0], as_cuda_type(a->get_const_values()), - a->get_const_col_idxs(), as_cuda_type(a->get_const_row_ptrs()), - as_cuda_type(b->get_const_values()), 
b->get_stride(), - as_cuda_type(c->get_values()), c->get_stride()); - } else if (a->get_strategy()->get_name() == "cusparse") { + IndexType max_length_per_row = 0; + using Tcsr = matrix::Csr; + if (auto strategy = + std::dynamic_pointer_cast( + a->get_strategy())) { + max_length_per_row = strategy->get_max_length_per_row(); + } else if (auto strategy = std::dynamic_pointer_cast< + const typename Tcsr::automatical>(a->get_strategy())) { + max_length_per_row = strategy->get_max_length_per_row(); + } else { + GKO_NOT_SUPPORTED(a->get_strategy()); + } + host_kernel::select_classical_spmv( + classical_kernels(), + [&max_length_per_row](int compiled_info) { + return max_length_per_row >= compiled_info; + }, + syn::value_list(), syn::type_list<>(), exec, a, b, c); + } else if (a->get_strategy()->get_name() == "sparselib" || + a->get_strategy()->get_name() == "cusparse") { if (cusparse::is_supported::value) { // TODO: add implementation for int64 and multiple RHS auto handle = exec->get_cusparse_handle(); @@ -739,7 +349,7 @@ void advanced_spmv(std::shared_ptr exec, const IndexType nwarps = a->get_num_srow_elements(); if (nwarps > 0) { - const dim3 csr_block(cuda_config::warp_size, warps_in_block, 1); + const dim3 csr_block(config::warp_size, warps_in_block, 1); const dim3 csr_grid(ceildiv(nwarps, warps_in_block), b->get_size()[1]); kernel::abstract_spmv<<>>( @@ -751,8 +361,11 @@ void advanced_spmv(std::shared_ptr exec, as_cuda_type(b->get_const_values()), as_cuda_type(b->get_stride()), as_cuda_type(c->get_values()), as_cuda_type(c->get_stride())); + } else { + GKO_NOT_SUPPORTED(nwarps); } - } else if (a->get_strategy()->get_name() == "cusparse") { + } else if (a->get_strategy()->get_name() == "sparselib" || + a->get_strategy()->get_name() == "cusparse") { if (cusparse::is_supported::value) { // TODO: add implementation for int64 and multiple RHS auto descr = cusparse::create_mat_descr(); @@ -776,15 +389,25 @@ void advanced_spmv(std::shared_ptr exec, GKO_NOT_IMPLEMENTED; } 
} else if (a->get_strategy()->get_name() == "classical") { - const dim3 grid(ceildiv(a->get_size()[0], classical_block_size), - b->get_size()[1]); - kernel::abstract_classical_spmv<<>>( - a->get_size()[0], as_cuda_type(alpha->get_const_values()), - as_cuda_type(a->get_const_values()), a->get_const_col_idxs(), - as_cuda_type(a->get_const_row_ptrs()), - as_cuda_type(b->get_const_values()), b->get_stride(), - as_cuda_type(beta->get_const_values()), - as_cuda_type(c->get_values()), c->get_stride()); + IndexType max_length_per_row = 0; + using Tcsr = matrix::Csr; + if (auto strategy = + std::dynamic_pointer_cast( + a->get_strategy())) { + max_length_per_row = strategy->get_max_length_per_row(); + } else if (auto strategy = std::dynamic_pointer_cast< + const typename Tcsr::automatical>(a->get_strategy())) { + max_length_per_row = strategy->get_max_length_per_row(); + } else { + GKO_NOT_SUPPORTED(a->get_strategy()); + } + host_kernel::select_classical_spmv( + classical_kernels(), + [&max_length_per_row](int compiled_info) { + return max_length_per_row >= compiled_info; + }, + syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, + beta); } else if (a->get_strategy()->get_name() == "merge_path") { int items_per_thread = host_kernel::compute_items_per_thread(exec); @@ -804,24 +427,228 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL); -namespace kernel { +template +void spgemm(std::shared_ptr exec, + const matrix::Csr *a, + const matrix::Csr *b, + matrix::Csr *c) +{ + if (cusparse::is_supported::value) { + auto handle = exec->get_cusparse_handle(); + cusparse::pointer_mode_guard pm_guard(handle); + auto a_descr = cusparse::create_mat_descr(); + auto b_descr = cusparse::create_mat_descr(); + auto c_descr = cusparse::create_mat_descr(); + auto d_descr = cusparse::create_mat_descr(); + auto info = cusparse::create_spgemm_info(); + + auto alpha = one(); + auto a_nnz = IndexType(a->get_num_stored_elements()); + auto a_vals = 
a->get_const_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto b_nnz = IndexType(b->get_num_stored_elements()); + auto b_vals = b->get_const_values(); + auto b_row_ptrs = b->get_const_row_ptrs(); + auto b_col_idxs = b->get_const_col_idxs(); + auto null_value = static_cast(nullptr); + auto null_index = static_cast(nullptr); + auto zero_nnz = IndexType{}; + auto m = IndexType(a->get_size()[0]); + auto n = IndexType(b->get_size()[1]); + auto k = IndexType(a->get_size()[1]); + auto c_row_ptrs = c->get_row_ptrs(); + matrix::CsrBuilder c_builder{c}; + auto &c_col_idxs_array = c_builder.get_col_idx_array(); + auto &c_vals_array = c_builder.get_value_array(); + + // allocate buffer + size_type buffer_size{}; + cusparse::spgemm_buffer_size( + handle, m, n, k, &alpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs, + b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr, + zero_nnz, null_index, null_index, info, buffer_size); + Array buffer_array(exec, buffer_size); + auto buffer = buffer_array.get_data(); + + // count nnz + IndexType c_nnz{}; + cusparse::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs, + a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs, + d_descr, zero_nnz, null_index, null_index, c_descr, + c_row_ptrs, &c_nnz, info, buffer); + + // accumulate non-zeros + c_col_idxs_array.resize_and_reset(c_nnz); + c_vals_array.resize_and_reset(c_nnz); + auto c_col_idxs = c_col_idxs_array.get_data(); + auto c_vals = c_vals_array.get_data(); + cusparse::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals, + a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, + b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz, + null_value, null_index, null_index, c_descr, c_vals, + c_row_ptrs, c_col_idxs, info, buffer); + + cusparse::destroy(info); + cusparse::destroy(d_descr); + cusparse::destroy(c_descr); + cusparse::destroy(b_descr); + cusparse::destroy(a_descr); + } else { + GKO_NOT_IMPLEMENTED; + } +} 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL); -template -__global__ __launch_bounds__(default_block_size) void convert_row_ptrs_to_idxs( - size_type num_rows, const IndexType *__restrict__ ptrs, - IndexType *__restrict__ idxs) + +template +void advanced_spgemm(std::shared_ptr exec, + const matrix::Dense *alpha, + const matrix::Csr *a, + const matrix::Csr *b, + const matrix::Dense *beta, + const matrix::Csr *d, + matrix::Csr *c) { - const auto tidx = threadIdx.x + blockDim.x * blockIdx.x; - if (tidx < num_rows) { - for (auto i = ptrs[tidx]; i < ptrs[tidx + 1]; i++) { - idxs[i] = tidx; - } + if (cusparse::is_supported::value) { + auto handle = exec->get_cusparse_handle(); + cusparse::pointer_mode_guard pm_guard(handle); + auto a_descr = cusparse::create_mat_descr(); + auto b_descr = cusparse::create_mat_descr(); + auto c_descr = cusparse::create_mat_descr(); + auto d_descr = cusparse::create_mat_descr(); + auto info = cusparse::create_spgemm_info(); + + auto valpha = exec->copy_val_to_host(alpha->get_const_values()); + auto a_nnz = IndexType(a->get_num_stored_elements()); + auto a_vals = a->get_const_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto b_nnz = IndexType(b->get_num_stored_elements()); + auto b_vals = b->get_const_values(); + auto b_row_ptrs = b->get_const_row_ptrs(); + auto b_col_idxs = b->get_const_col_idxs(); + auto vbeta = exec->copy_val_to_host(beta->get_const_values()); + auto d_nnz = IndexType(d->get_num_stored_elements()); + auto d_vals = d->get_const_values(); + auto d_row_ptrs = d->get_const_row_ptrs(); + auto d_col_idxs = d->get_const_col_idxs(); + auto m = IndexType(a->get_size()[0]); + auto n = IndexType(b->get_size()[1]); + auto k = IndexType(a->get_size()[1]); + auto c_row_ptrs = c->get_row_ptrs(); + matrix::CsrBuilder c_builder{c}; + auto &c_col_idxs_array = c_builder.get_col_idx_array(); + auto &c_vals_array = c_builder.get_value_array(); + + // 
allocate buffer + size_type buffer_size{}; + cusparse::spgemm_buffer_size( + handle, m, n, k, &valpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs, + b_descr, b_nnz, b_row_ptrs, b_col_idxs, &vbeta, d_descr, d_nnz, + d_row_ptrs, d_col_idxs, info, buffer_size); + Array buffer_array(exec, buffer_size); + auto buffer = buffer_array.get_data(); + + // count nnz + IndexType c_nnz{}; + cusparse::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs, + a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs, + d_descr, d_nnz, d_row_ptrs, d_col_idxs, c_descr, + c_row_ptrs, &c_nnz, info, buffer); + + // accumulate non-zeros + c_col_idxs_array.resize_and_reset(c_nnz); + c_vals_array.resize_and_reset(c_nnz); + auto c_col_idxs = c_col_idxs_array.get_data(); + auto c_vals = c_vals_array.get_data(); + cusparse::spgemm(handle, m, n, k, &valpha, a_descr, a_nnz, a_vals, + a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, + b_row_ptrs, b_col_idxs, &vbeta, d_descr, d_nnz, d_vals, + d_row_ptrs, d_col_idxs, c_descr, c_vals, c_row_ptrs, + c_col_idxs, info, buffer); + + cusparse::destroy(info); + cusparse::destroy(d_descr); + cusparse::destroy(c_descr); + cusparse::destroy(b_descr); + cusparse::destroy(a_descr); + } else { + GKO_NOT_IMPLEMENTED; } } +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL); + + +namespace { + + +template +void spgeam(syn::value_list, + std::shared_ptr exec, const ValueType *alpha, + const IndexType *a_row_ptrs, const IndexType *a_col_idxs, + const ValueType *a_vals, const ValueType *beta, + const IndexType *b_row_ptrs, const IndexType *b_col_idxs, + const ValueType *b_vals, matrix::Csr *c) +{ + auto m = static_cast(c->get_size()[0]); + auto c_row_ptrs = c->get_row_ptrs(); + // count nnz for alpha * A + beta * B + auto subwarps_per_block = default_block_size / subwarp_size; + auto num_blocks = ceildiv(m, subwarps_per_block); + kernel::spgeam_nnz<<>>( + a_row_ptrs, a_col_idxs, b_row_ptrs, b_col_idxs, m, c_row_ptrs); + + // build row 
pointers + components::prefix_sum(exec, c_row_ptrs, m + 1); + + // accumulate non-zeros for alpha * A + beta * B + matrix::CsrBuilder c_builder{c}; + auto c_nnz = exec->copy_val_to_host(c_row_ptrs + m); + c_builder.get_col_idx_array().resize_and_reset(c_nnz); + c_builder.get_value_array().resize_and_reset(c_nnz); + auto c_col_idxs = c->get_col_idxs(); + auto c_vals = c->get_values(); + kernel::spgeam<<>>( + as_cuda_type(alpha), a_row_ptrs, a_col_idxs, as_cuda_type(a_vals), + as_cuda_type(beta), b_row_ptrs, b_col_idxs, as_cuda_type(b_vals), m, + c_row_ptrs, c_col_idxs, as_cuda_type(c_vals)); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_spgeam, spgeam); + -} // namespace kernel +} // namespace + + +template +void spgeam(std::shared_ptr exec, + const matrix::Dense *alpha, + const matrix::Csr *a, + const matrix::Dense *beta, + const matrix::Csr *b, + matrix::Csr *c) +{ + auto total_nnz = + a->get_num_stored_elements() + b->get_num_stored_elements(); + auto nnz_per_row = total_nnz / a->get_size()[0]; + select_spgeam( + spgeam_kernels(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= nnz_per_row || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, + alpha->get_const_values(), a->get_const_row_ptrs(), + a->get_const_col_idxs(), a->get_const_values(), + beta->get_const_values(), b->get_const_row_ptrs(), + b->get_const_col_idxs(), b->get_const_values(), c); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); template @@ -838,8 +665,8 @@ void convert_row_ptrs_to_idxs(std::shared_ptr exec, template void convert_to_coo(std::shared_ptr exec, - matrix::Coo *result, - const matrix::Csr *source) + const matrix::Csr *source, + matrix::Coo *result) { auto num_rows = result->get_size()[0]; @@ -853,46 +680,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONVERT_TO_COO_KERNEL); -namespace kernel { - - -template -__global__ - 
__launch_bounds__(cuda_config::max_block_size) void initialize_zero_dense( - size_type num_rows, size_type num_cols, size_type stride, - ValueType *__restrict__ result) -{ - const auto tidx_x = threadIdx.x + blockDim.x * blockIdx.x; - const auto tidx_y = threadIdx.y + blockDim.y * blockIdx.y; - if (tidx_x < num_cols && tidx_y < num_rows) { - result[tidx_y * stride + tidx_x] = zero(); - } -} - - -template -__global__ __launch_bounds__(default_block_size) void fill_in_dense( - size_type num_rows, const IndexType *__restrict__ row_ptrs, - const IndexType *__restrict__ col_idxs, - const ValueType *__restrict__ values, size_type stride, - ValueType *__restrict__ result) -{ - const auto tidx = threadIdx.x + blockDim.x * blockIdx.x; - if (tidx < num_rows) { - for (auto i = row_ptrs[tidx]; i < row_ptrs[tidx + 1]; i++) { - result[stride * tidx + col_idxs[i]] = values[i]; - } - } -} - - -} // namespace kernel - - template void convert_to_dense(std::shared_ptr exec, - matrix::Dense *result, - const matrix::Csr *source) + const matrix::Csr *source, + matrix::Dense *result) { const auto num_rows = result->get_size()[0]; const auto num_cols = result->get_size()[1]; @@ -901,9 +692,8 @@ void convert_to_dense(std::shared_ptr exec, const auto col_idxs = source->get_const_col_idxs(); const auto vals = source->get_const_values(); - const dim3 block_size(cuda_config::warp_size, - cuda_config::max_block_size / cuda_config::warp_size, - 1); + const dim3 block_size(config::warp_size, + config::max_block_size / config::warp_size, 1); const dim3 init_grid_dim(ceildiv(stride, block_size.x), ceildiv(num_rows, block_size.y), 1); kernel::initialize_zero_dense<<>>( @@ -919,98 +709,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL); -namespace kernel { - - -template -__global__ __launch_bounds__(default_block_size) void calculate_nnz_per_row( - size_type num_rows, const IndexType *__restrict__ row_ptrs, - size_type *__restrict__ nnz_per_row) -{ - const 
auto tidx = threadIdx.x + blockIdx.x * blockDim.x; - if (tidx < num_rows) { - nnz_per_row[tidx] = row_ptrs[tidx + 1] - row_ptrs[tidx]; - } -} - - -__global__ - __launch_bounds__(cuda_config::warp_size) void calculate_slice_lengths( - size_type num_rows, size_type slice_size, size_type stride_factor, - const size_type *__restrict__ nnz_per_row, - size_type *__restrict__ slice_lengths, - size_type *__restrict__ slice_sets) -{ - constexpr auto warp_size = cuda_config::warp_size; - const auto sliceid = blockIdx.x; - const auto tid_in_warp = threadIdx.x; - - if (sliceid * slice_size + tid_in_warp < num_rows) { - size_type thread_result = 0; - for (auto i = tid_in_warp; i < slice_size; i += warp_size) { - thread_result = - (i + slice_size * sliceid < num_rows) - ? max(thread_result, nnz_per_row[sliceid * slice_size + i]) - : thread_result; - } - - auto warp_tile = - group::tiled_partition(group::this_thread_block()); - auto warp_result = gko::kernels::cuda::reduce( - warp_tile, thread_result, - [](const size_type &a, const size_type &b) { return max(a, b); }); - - if (tid_in_warp == 0) { - auto slice_length = - ceildiv(warp_result, stride_factor) * stride_factor; - slice_lengths[sliceid] = slice_length; - slice_sets[sliceid] = slice_length; - } - } -} - - -template -__global__ __launch_bounds__(default_block_size) void fill_in_sellp( - size_type num_rows, size_type slice_size, - const ValueType *__restrict__ source_values, - const IndexType *__restrict__ source_row_ptrs, - const IndexType *__restrict__ source_col_idxs, - size_type *__restrict__ slice_lengths, size_type *__restrict__ slice_sets, - IndexType *__restrict__ result_col_idxs, - ValueType *__restrict__ result_values) -{ - const auto global_row = threadIdx.x + blockIdx.x * blockDim.x; - const auto row = global_row % slice_size; - const auto sliceid = global_row / slice_size; - - if (global_row < num_rows) { - size_type sellp_ind = slice_sets[sliceid] * slice_size + row; - - for (size_type csr_ind = 
source_row_ptrs[global_row]; - csr_ind < source_row_ptrs[global_row + 1]; csr_ind++) { - result_values[sellp_ind] = source_values[csr_ind]; - result_col_idxs[sellp_ind] = source_col_idxs[csr_ind]; - sellp_ind += slice_size; - } - for (size_type i = sellp_ind; - i < - (slice_sets[sliceid] + slice_lengths[sliceid]) * slice_size + row; - i += slice_size) { - result_col_idxs[i] = 0; - result_values[i] = zero(); - } - } -} - - -} // namespace kernel - - template void convert_to_sellp(std::shared_ptr exec, - matrix::Sellp *result, - const matrix::Csr *source) + const matrix::Csr *source, + matrix::Sellp *result) { const auto num_rows = result->get_size()[0]; const auto num_cols = result->get_size()[1]; @@ -1041,22 +743,12 @@ void convert_to_sellp(std::shared_ptr exec, grid_dim = slice_num; - kernel::calculate_slice_lengths<<>>( + kernel::calculate_slice_lengths<<>>( num_rows, slice_size, stride_factor, as_cuda_type(nnz_per_row.get_const_data()), as_cuda_type(slice_lengths), as_cuda_type(slice_sets)); - auto add_values = - Array(exec, ceildiv(slice_num + 1, default_block_size)); - grid_dim = ceildiv(slice_num + 1, default_block_size); - - start_prefix_sum<<>>( - slice_num + 1, as_cuda_type(slice_sets), - as_cuda_type(add_values.get_data())); - - finalize_prefix_sum<<>>( - slice_num + 1, as_cuda_type(slice_sets), - as_cuda_type(add_values.get_const_data())); + components::prefix_sum(exec, slice_sets, slice_num + 1); grid_dim = ceildiv(num_rows, default_block_size); kernel::fill_in_sellp<<>>( @@ -1064,66 +756,16 @@ void convert_to_sellp(std::shared_ptr exec, as_cuda_type(source_row_ptrs), as_cuda_type(source_col_idxs), as_cuda_type(slice_lengths), as_cuda_type(slice_sets), as_cuda_type(result_col_idxs), as_cuda_type(result_values)); - - nnz_per_row.clear(); - add_values.clear(); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL); -namespace kernel { - - -template -__global__ __launch_bounds__(default_block_size) void 
initialize_zero_ell( - size_type max_nnz_per_row, size_type stride, ValueType *__restrict__ values, - IndexType *__restrict__ col_idxs) -{ - const auto tidx = threadIdx.x + blockIdx.x * blockDim.x; - - if (tidx < stride * max_nnz_per_row) { - values[tidx] = zero(); - col_idxs[tidx] = 0; - } -} - - -template -__global__ __launch_bounds__(default_block_size) void fill_in_ell( - size_type num_rows, size_type stride, - const ValueType *__restrict__ source_values, - const IndexType *__restrict__ source_row_ptrs, - const IndexType *__restrict__ source_col_idxs, - ValueType *__restrict__ result_values, - IndexType *__restrict__ result_col_idxs) -{ - const auto tidx = threadIdx.x + blockIdx.x * blockDim.x; - constexpr auto warp_size = cuda_config::warp_size; - const auto row = tidx / warp_size; - const auto local_tidx = tidx % warp_size; - - if (row < num_rows) { - for (size_type i = local_tidx; - i < source_row_ptrs[row + 1] - source_row_ptrs[row]; - i += warp_size) { - const auto result_idx = row + stride * i; - const auto source_idx = i + source_row_ptrs[row]; - result_values[result_idx] = source_values[source_idx]; - result_col_idxs[result_idx] = source_col_idxs[source_idx]; - } - } -} - - -} // namespace kernel - - template void convert_to_ell(std::shared_ptr exec, - matrix::Ell *result, - const matrix::Csr *source) + const matrix::Csr *source, + matrix::Ell *result) { const auto source_values = source->get_const_values(); const auto source_row_ptrs = source->get_const_row_ptrs(); @@ -1144,7 +786,7 @@ void convert_to_ell(std::shared_ptr exec, as_cuda_type(result_col_idxs)); const auto grid_dim = - ceildiv(num_rows * cuda_config::warp_size, default_block_size); + ceildiv(num_rows * config::warp_size, default_block_size); kernel::fill_in_ell<<>>( num_rows, stride, as_cuda_type(source_values), @@ -1156,57 +798,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL); -namespace kernel { - - -__global__ 
__launch_bounds__(default_block_size) void reduce_max_nnz_per_slice( - size_type num_rows, size_type slice_size, size_type stride_factor, - const size_type *__restrict__ nnz_per_row, size_type *__restrict__ result) -{ - const auto tidx = threadIdx.x + blockIdx.x * blockDim.x; - constexpr auto warp_size = cuda_config::warp_size; - const auto warpid = tidx / warp_size; - const auto tid_in_warp = tidx % warp_size; - const auto slice_num = ceildiv(num_rows, slice_size); - - size_type thread_result = 0; - for (auto i = tid_in_warp; i < slice_size; i += warp_size) { - if (warpid * slice_size + i < num_rows) { - thread_result = - max(thread_result, nnz_per_row[warpid * slice_size + i]); - } - } - - auto warp_tile = - group::tiled_partition(group::this_thread_block()); - auto warp_result = gko::kernels::cuda::reduce( - warp_tile, thread_result, - [](const size_type &a, const size_type &b) { return max(a, b); }); - - if (tid_in_warp == 0 && warpid < slice_num) { - result[warpid] = ceildiv(warp_result, stride_factor) * stride_factor; - } -} - - -__global__ __launch_bounds__(default_block_size) void reduce_total_cols( - size_type num_slices, const size_type *__restrict__ max_nnz_per_slice, - size_type *__restrict__ result) -{ - extern __shared__ size_type block_result[]; - - reduce_array(num_slices, max_nnz_per_slice, block_result, - [](const size_type &x, const size_type &y) { return x + y; }); - - if (threadIdx.x == 0) { - result[blockIdx.x] = block_result[0]; - } -} - - -} // namespace kernel - - template void calculate_total_cols(std::shared_ptr exec, const matrix::Csr *source, @@ -1223,7 +814,7 @@ void calculate_total_cols(std::shared_ptr exec, kernel::calculate_nnz_per_row<<>>( num_rows, as_cuda_type(row_ptrs), as_cuda_type(nnz_per_row.get_data())); - grid_dim = ceildiv(slice_num * cuda_config::warp_size, default_block_size); + grid_dim = ceildiv(slice_num * config::warp_size, default_block_size); auto max_nnz_per_slice = Array(exec, slice_num); 
kernel::reduce_max_nnz_per_slice<<>>( @@ -1234,25 +825,17 @@ void calculate_total_cols(std::shared_ptr exec, grid_dim = ceildiv(slice_num, default_block_size); auto block_results = Array(exec, grid_dim); - kernel::reduce_total_cols<<>>( + kernel::reduce_total_cols<<>>( slice_num, as_cuda_type(max_nnz_per_slice.get_const_data()), as_cuda_type(block_results.get_data())); auto d_result = Array(exec, 1); - kernel::reduce_total_cols<<<1, default_block_size, - default_block_size * sizeof(size_type)>>>( + kernel::reduce_total_cols<<<1, default_block_size>>>( grid_dim, as_cuda_type(block_results.get_const_data()), as_cuda_type(d_result.get_data())); - exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(), - result); - - block_results.clear(); - nnz_per_row.clear(); - max_nnz_per_slice.clear(); - d_result.clear(); + *result = exec->copy_val_to_host(d_result.get_const_data()); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1261,8 +844,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void transpose(std::shared_ptr exec, - matrix::Csr *trans, - const matrix::Csr *orig) + const matrix::Csr *orig, + matrix::Csr *trans) { if (cusparse::is_supported::value) { cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; @@ -1279,33 +862,13 @@ void transpose(std::shared_ptr exec, } } - GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL); -namespace { - - -template -__global__ __launch_bounds__(default_block_size) void conjugate_kernel( - size_type num_nonzeros, ValueType *__restrict__ val) -{ - const auto tidx = - static_cast(blockIdx.x) * default_block_size + threadIdx.x; - - if (tidx < num_nonzeros) { - val[tidx] = conj(val[tidx]); - } -} - - -} // namespace - - template void conj_transpose(std::shared_ptr exec, - matrix::Csr *trans, - const matrix::Csr *orig) + const matrix::Csr *orig, + matrix::Csr *trans) { if (cusparse::is_supported::value) { const dim3 block_size(default_block_size, 1, 1); @@ -1334,26 +897,48 @@ 
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL); -namespace kernel { +template +void row_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Csr *orig, + matrix::Csr *row_permuted) + GKO_NOT_IMPLEMENTED; +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL); -__global__ __launch_bounds__(default_block_size) void reduce_max_nnz( - size_type size, const size_type *__restrict__ nnz_per_row, - size_type *__restrict__ result) -{ - extern __shared__ size_type block_max[]; - reduce_array( - size, nnz_per_row, block_max, - [](const size_type &x, const size_type &y) { return max(x, y); }); +template +void column_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Csr *orig, + matrix::Csr *column_permuted) + GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL); - if (threadIdx.x == 0) { - result[blockIdx.x] = block_max[0]; - } -} +template +void inverse_row_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Csr *orig, + matrix::Csr *row_permuted) + GKO_NOT_IMPLEMENTED; -} // namespace kernel +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL); + + +template +void inverse_column_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Csr *orig, + matrix::Csr *column_permuted) + GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL); template @@ -1374,93 +959,25 @@ void calculate_max_nnz_per_row(std::shared_ptr exec, const auto n = ceildiv(num_rows, default_block_size); const auto reduce_dim = n <= default_block_size ? 
n : default_block_size; - kernel::reduce_max_nnz<<>>( + kernel::reduce_max_nnz<<>>( num_rows, as_cuda_type(nnz_per_row.get_const_data()), as_cuda_type(block_results.get_data())); - kernel::reduce_max_nnz<<<1, default_block_size, - default_block_size * sizeof(size_type)>>>( + kernel::reduce_max_nnz<<<1, default_block_size>>>( reduce_dim, as_cuda_type(block_results.get_const_data()), as_cuda_type(d_result.get_data())); - exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(), - result); - - nnz_per_row.clear(); - block_results.clear(); - d_result.clear(); + *result = exec->copy_val_to_host(d_result.get_const_data()); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); -namespace kernel { - - -template -__global__ - __launch_bounds__(default_block_size) void calculate_hybrid_coo_row_nnz( - size_type num_rows, size_type ell_max_nnz_per_row, - IndexType *__restrict__ csr_row_idxs, - size_type *__restrict__ coo_row_nnz) -{ - const auto tidx = threadIdx.x + blockIdx.x * blockDim.x; - if (tidx < num_rows) { - const size_type csr_nnz = csr_row_idxs[tidx + 1] - csr_row_idxs[tidx]; - coo_row_nnz[tidx] = - (csr_nnz > ell_max_nnz_per_row) * (csr_nnz - ell_max_nnz_per_row); - } -} - - -template -__global__ __launch_bounds__(default_block_size) void fill_in_hybrid( - size_type num_rows, size_type stride, size_type ell_max_nnz_per_row, - const ValueType *__restrict__ source_values, - const IndexType *__restrict__ source_row_ptrs, - const IndexType *__restrict__ source_col_idxs, - const size_type *__restrict__ coo_offset, - ValueType *__restrict__ result_ell_val, - IndexType *__restrict__ result_ell_col, - ValueType *__restrict__ result_coo_val, - IndexType *__restrict__ result_coo_col, - IndexType *__restrict__ result_coo_row) -{ - const auto tidx = threadIdx.x + blockIdx.x * blockDim.x; - constexpr auto warp_size = cuda_config::warp_size; - const auto row = tidx / warp_size; - const auto local_tidx = tidx % 
warp_size; - - if (row < num_rows) { - for (size_type i = local_tidx; - i < source_row_ptrs[row + 1] - source_row_ptrs[row]; - i += warp_size) { - const auto source_idx = i + source_row_ptrs[row]; - if (i < ell_max_nnz_per_row) { - const auto result_idx = row + stride * i; - result_ell_val[result_idx] = source_values[source_idx]; - result_ell_col[result_idx] = source_col_idxs[source_idx]; - } else { - const auto result_idx = - coo_offset[row] + i - ell_max_nnz_per_row; - result_coo_val[result_idx] = source_values[source_idx]; - result_coo_col[result_idx] = source_col_idxs[source_idx]; - result_coo_row[result_idx] = row; - } - } - } -} - - -} // namespace kernel - - template void convert_to_hybrid(std::shared_ptr exec, - matrix::Hybrid *result, - const matrix::Csr *source) + const matrix::Csr *source, + matrix::Hybrid *result) { auto ell_val = result->get_ell_values(); auto ell_col = result->get_ell_col_idxs(); @@ -1482,17 +999,9 @@ void convert_to_hybrid(std::shared_ptr exec, num_rows, max_nnz_per_row, as_cuda_type(source->get_const_row_ptrs()), as_cuda_type(coo_offset.get_data())); - auto add_values = - Array(exec, ceildiv(num_rows, default_block_size)); - grid_dim = ceildiv(num_rows, default_block_size); - start_prefix_sum<<>>( - num_rows, as_cuda_type(coo_offset.get_data()), - as_cuda_type(add_values.get_data())); - finalize_prefix_sum<<>>( - num_rows, as_cuda_type(coo_offset.get_data()), - as_cuda_type(add_values.get_const_data())); - - grid_dim = ceildiv(num_rows * cuda_config::warp_size, default_block_size); + components::prefix_sum(exec, coo_offset.get_data(), num_rows); + + grid_dim = ceildiv(num_rows * config::warp_size, default_block_size); kernel::fill_in_hybrid<<>>( num_rows, stride, max_nnz_per_row, as_cuda_type(source->get_const_values()), @@ -1527,7 +1036,46 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void sort_by_column_index(std::shared_ptr exec, matrix::Csr *to_sort) - GKO_NOT_IMPLEMENTED; +{ + if (cusparse::is_supported::value) { + 
auto handle = exec->get_cusparse_handle(); + auto descr = cusparse::create_mat_descr(); + auto m = IndexType(to_sort->get_size()[0]); + auto n = IndexType(to_sort->get_size()[1]); + auto nnz = IndexType(to_sort->get_num_stored_elements()); + auto row_ptrs = to_sort->get_const_row_ptrs(); + auto col_idxs = to_sort->get_col_idxs(); + auto vals = to_sort->get_values(); + + // copy values + Array tmp_vals_array(exec, nnz); + exec->copy(nnz, vals, tmp_vals_array.get_data()); + auto tmp_vals = tmp_vals_array.get_const_data(); + + // init identity permutation + Array permutation_array(exec, nnz); + auto permutation = permutation_array.get_data(); + cusparse::create_identity_permutation(handle, nnz, permutation); + + // allocate buffer + size_type buffer_size{}; + cusparse::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs, + buffer_size); + Array buffer_array{exec, buffer_size}; + auto buffer = buffer_array.get_data(); + + // sort column indices + cusparse::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs, + permutation, buffer); + + // sort values + cusparse::gather(handle, nnz, tmp_vals, vals, permutation); + + cusparse::destroy(descr); + } else { + GKO_NOT_IMPLEMENTED; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX); @@ -1536,8 +1084,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void is_sorted_by_column_index( std::shared_ptr exec, - const matrix::Csr *to_check, - bool *is_sorted) GKO_NOT_IMPLEMENTED; + const matrix::Csr *to_check, bool *is_sorted) +{ + *is_sorted = true; + auto cpu_array = Array::view(exec->get_master(), 1, is_sorted); + auto gpu_array = Array{exec, cpu_array}; + auto block_size = default_block_size; + auto num_rows = static_cast(to_check->get_size()[0]); + auto num_blocks = ceildiv(num_rows, block_size); + kernel::check_unsorted<<>>( + to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(), + num_rows, gpu_array.get_data()); + cpu_array = gpu_array; +} 
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX); diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu index a74c431599a..b694ba6b42d 100644 --- a/cuda/matrix/dense_kernels.cu +++ b/cuda/matrix/dense_kernels.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -42,11 +42,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/components/prefix_sum.hpp" +#include "cuda/base/config.hpp" #include "cuda/base/cublas_bindings.hpp" #include "cuda/base/pointer_mode_guard.hpp" #include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/prefix_sum.cuh" #include "cuda/components/reduction.cuh" +#include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" @@ -64,6 +66,9 @@ namespace dense { constexpr auto default_block_size = 512; +#include "common/matrix/dense_kernels.hpp.inc" + + template void simple_apply(std::shared_ptr exec, const matrix::Dense *a, @@ -111,33 +116,6 @@ void apply(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); -namespace kernel { - - -template -__global__ __launch_bounds__(block_size) void scale( - size_type num_rows, size_type num_cols, size_type num_alpha_cols, - const ValueType *__restrict__ alpha, ValueType *__restrict__ x, - size_type stride_x) -{ - constexpr auto warps_per_block = block_size / cuda_config::warp_size; - const auto global_id = - thread::get_thread_id(); - const auto row_id = global_id / num_cols; - const auto col_id = global_id % num_cols; - const auto alpha_id = num_alpha_cols == 1 ? 0 : col_id; - if (row_id < num_rows) { - x[row_id * stride_x + col_id] = - alpha[alpha_id] == zero() - ? 
zero() - : x[row_id * stride_x + col_id] * alpha[alpha_id]; - } -} - - -} // namespace kernel - - template void scale(std::shared_ptr exec, const matrix::Dense *alpha, matrix::Dense *x) @@ -151,8 +129,8 @@ void scale(std::shared_ptr exec, constexpr auto block_size = default_block_size; const dim3 grid_dim = ceildiv(x->get_size()[0] * x->get_size()[1], block_size); - const dim3 block_dim{cuda_config::warp_size, 1, - block_size / cuda_config::warp_size}; + const dim3 block_dim{config::warp_size, 1, + block_size / config::warp_size}; kernel::scale<<>>( x->get_size()[0], x->get_size()[1], alpha->get_size()[1], as_cuda_type(alpha->get_const_values()), @@ -163,31 +141,6 @@ void scale(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL); -namespace kernel { - - -template -__global__ __launch_bounds__(block_size) void add_scaled( - size_type num_rows, size_type num_cols, size_type num_alpha_cols, - const ValueType *__restrict__ alpha, const ValueType *__restrict__ x, - size_type stride_x, ValueType *__restrict__ y, size_type stride_y) -{ - constexpr auto warps_per_block = block_size / cuda_config::warp_size; - const auto global_id = - thread::get_thread_id(); - const auto row_id = global_id / num_cols; - const auto col_id = global_id % num_cols; - const auto alpha_id = num_alpha_cols == 1 ? 
0 : col_id; - if (row_id < num_rows && alpha[alpha_id] != zero()) { - y[row_id * stride_y + col_id] += - x[row_id * stride_x + col_id] * alpha[alpha_id]; - } -} - - -} // namespace kernel - - template void add_scaled(std::shared_ptr exec, const matrix::Dense *alpha, @@ -202,8 +155,8 @@ void add_scaled(std::shared_ptr exec, constexpr auto block_size = default_block_size; const dim3 grid_dim = ceildiv(x->get_size()[0] * x->get_size()[1], block_size); - const dim3 block_dim{cuda_config::warp_size, 1, - block_size / cuda_config::warp_size}; + const dim3 block_dim{config::warp_size, 1, + block_size / config::warp_size}; kernel::add_scaled<<>>( x->get_size()[0], x->get_size()[1], alpha->get_size()[1], as_cuda_type(alpha->get_const_values()), @@ -215,63 +168,6 @@ void add_scaled(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_KERNEL); -namespace kernel { - - -template -__global__ __launch_bounds__(block_size) void compute_partial_dot( - size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, - const ValueType *__restrict__ y, size_type stride_y, - ValueType *__restrict__ work) -{ - constexpr auto warps_per_block = block_size / cuda_config::warp_size; - - const auto num_blocks = gridDim.x; - const auto local_id = thread::get_local_thread_id(); - const auto global_id = - thread::get_thread_id(); - - auto tmp = zero(); - for (auto i = global_id; i < num_rows; i += block_size * num_blocks) { - tmp += x[i * stride_x] * y[i * stride_y]; - } - __shared__ UninitializedArray tmp_work; - tmp_work[local_id] = tmp; - - reduce(group::this_thread_block(), static_cast(tmp_work), - [](const ValueType &x, const ValueType &y) { return x + y; }); - - if (local_id == 0) { - work[thread::get_block_id()] = tmp_work[0]; - } -} - - -template -__global__ __launch_bounds__(block_size) void finalize_dot_computation( - size_type size, const ValueType *work, ValueType *result) -{ - const auto local_id = thread::get_local_thread_id(); - - 
ValueType tmp = zero(); - for (auto i = local_id; i < size; i += block_size) { - tmp += work[i]; - } - __shared__ UninitializedArray tmp_work; - tmp_work[local_id] = tmp; - - reduce(group::this_thread_block(), static_cast(tmp_work), - [](const ValueType &x, const ValueType &y) { return x + y; }); - - if (local_id == 0) { - *result = tmp_work[0]; - } -} - - -} // namespace kernel - - template void compute_dot(std::shared_ptr exec, const matrix::Dense *x, @@ -295,8 +191,8 @@ void compute_dot(std::shared_ptr exec, constexpr auto work_per_block = work_per_thread * block_size; const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{cuda_config::warp_size, 1, - block_size / cuda_config::warp_size}; + const dim3 block_dim{config::warp_size, 1, + block_size / config::warp_size}; Array work(exec, grid_dim.x); // TODO: write a kernel which does this more efficiently for (size_type col = 0; col < x->get_size()[1]; ++col) { @@ -311,32 +207,13 @@ void compute_dot(std::shared_ptr exec, } } - GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); -namespace kernel { - - -template -__global__ __launch_bounds__(default_block_size) void compute_sqrt( - size_type num_cols, ValueType *__restrict__ work) -{ - const auto tidx = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; - if (tidx < num_cols) { - work[tidx] = sqrt(abs(work[tidx])); - } -} - - -} // namespace kernel - - template void compute_norm2(std::shared_ptr exec, const matrix::Dense *x, - matrix::Dense *result) + matrix::Dense> *result) { if (cublas::is_supported::value) { for (size_type col = 0; col < x->get_size()[1]; ++col) { @@ -345,51 +222,37 @@ void compute_norm2(std::shared_ptr exec, result->get_values() + col); } } else { - compute_dot(exec, x, x, result); - const dim3 block_size(default_block_size, 1, 1); - const dim3 grid_size(ceildiv(result->get_size()[1], block_size.x), 1, - 1); - kernel::compute_sqrt<<>>( - result->get_size()[1], 
as_cuda_type(result->get_values())); - } -} - - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); - -namespace kernel { - + using norm_type = remove_complex; + // TODO: these are tuning parameters obtained experimentally, once + // we decide how to handle this uniformly, they should be modified + // appropriately + constexpr auto work_per_thread = 32; + constexpr auto block_size = 1024; -template -__global__ __launch_bounds__(default_block_size) void fill_in_coo( - size_type num_rows, size_type num_cols, size_type stride, - const size_type *__restrict__ row_ptrs, - const ValueType *__restrict__ source, IndexType *__restrict__ row_idxs, - IndexType *__restrict__ col_idxs, ValueType *__restrict__ values) -{ - const auto tidx = threadIdx.x + blockDim.x * blockIdx.x; - if (tidx < num_rows) { - size_type write_to = row_ptrs[tidx]; - - for (size_type i = 0; i < num_cols; i++) { - if (source[stride * tidx + i] != zero()) { - values[write_to] = source[stride * tidx + i]; - col_idxs[write_to] = i; - row_idxs[write_to] = tidx; - write_to++; - } + constexpr auto work_per_block = work_per_thread * block_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{config::warp_size, 1, + block_size / config::warp_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + kernel::compute_partial_norm2<<>>( + x->get_size()[0], as_cuda_type(x->get_const_values() + col), + x->get_stride(), as_cuda_type(work.get_data())); + kernel::finalize_norm2_computation<<<1, block_dim>>>( + grid_dim.x, as_cuda_type(work.get_const_data()), + as_cuda_type(result->get_values() + col)); } } } - -} // namespace kernel +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); template void convert_to_coo(std::shared_ptr exec, - matrix::Coo *result, - const matrix::Dense *source) + const matrix::Dense *source, + 
matrix::Coo *result) { auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; @@ -403,88 +266,25 @@ void convert_to_coo(std::shared_ptr exec, auto nnz_prefix_sum = Array(exec, num_rows); calculate_nonzeros_per_row(exec, source, &nnz_prefix_sum); - const size_type grid_dim = ceildiv(num_rows, default_block_size); - auto add_values = Array(exec, grid_dim); + components::prefix_sum(exec, nnz_prefix_sum.get_data(), num_rows); - start_prefix_sum<<>>( - num_rows, as_cuda_type(nnz_prefix_sum.get_data()), - as_cuda_type(add_values.get_data())); - - finalize_prefix_sum<<>>( - num_rows, as_cuda_type(nnz_prefix_sum.get_data()), - as_cuda_type(add_values.get_data())); + size_type grid_dim = ceildiv(num_rows, default_block_size); kernel::fill_in_coo<<>>( num_rows, num_cols, stride, as_cuda_type(nnz_prefix_sum.get_const_data()), as_cuda_type(source->get_const_values()), as_cuda_type(row_idxs), as_cuda_type(col_idxs), as_cuda_type(values)); - - nnz_prefix_sum.clear(); - add_values.clear(); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL); -namespace kernel { - - -template -__global__ __launch_bounds__(default_block_size) void count_nnz_per_row( - size_type num_rows, size_type num_cols, size_type stride, - const ValueType *__restrict__ work, IndexType *__restrict__ result) -{ - constexpr auto warp_size = cuda_config::warp_size; - const auto tidx = threadIdx.x + blockIdx.x * blockDim.x; - const auto row_idx = tidx / warp_size; - - if (row_idx < num_rows) { - IndexType part_result{}; - for (auto i = threadIdx.x % warp_size; i < num_cols; i += warp_size) { - if (work[stride * row_idx + i] != zero()) { - part_result += 1; - } - } - - auto warp_tile = - group::tiled_partition(group::this_thread_block()); - result[row_idx] = reduce( - warp_tile, part_result, - [](const size_type &a, const size_type &b) { return a + b; }); - } -} - - -template -__global__ __launch_bounds__(default_block_size) void fill_in_csr( - size_type 
num_rows, size_type num_cols, size_type stride, - const ValueType *__restrict__ source, IndexType *__restrict__ row_ptrs, - IndexType *__restrict__ col_idxs, ValueType *__restrict__ values) -{ - const auto tidx = threadIdx.x + blockDim.x * blockIdx.x; - - if (tidx < num_rows) { - auto write_to = row_ptrs[tidx]; - for (auto i = 0; i < num_cols; i++) { - if (source[stride * tidx + i] != zero()) { - values[write_to] = source[stride * tidx + i]; - col_idxs[write_to] = i; - write_to++; - } - } - } -} - - -} // namespace kernel - - template void convert_to_csr(std::shared_ptr exec, - matrix::Csr *result, - const matrix::Dense *source) + const matrix::Dense *source, + matrix::Csr *result) { auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; @@ -495,77 +295,30 @@ void convert_to_csr(std::shared_ptr exec, auto stride = source->get_stride(); - const auto rows_per_block = - ceildiv(default_block_size, cuda_config::warp_size); + const auto rows_per_block = ceildiv(default_block_size, config::warp_size); const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); kernel::count_nnz_per_row<<>>( num_rows, num_cols, stride, as_cuda_type(source->get_const_values()), as_cuda_type(row_ptrs)); - size_type grid_dim = ceildiv(num_rows + 1, default_block_size); - auto add_values = Array(exec, grid_dim); - - start_prefix_sum - <<>>(num_rows + 1, as_cuda_type(row_ptrs), - as_cuda_type(add_values.get_data())); + components::prefix_sum(exec, row_ptrs, num_rows + 1); - finalize_prefix_sum<<>>( - num_rows + 1, as_cuda_type(row_ptrs), - as_cuda_type(add_values.get_const_data())); + size_type grid_dim = ceildiv(num_rows, default_block_size); kernel::fill_in_csr<<>>( num_rows, num_cols, stride, as_cuda_type(source->get_const_values()), as_cuda_type(row_ptrs), as_cuda_type(col_idxs), as_cuda_type(values)); - - add_values.clear(); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL); -namespace kernel { - - -template 
-__global__ __launch_bounds__(default_block_size) void fill_in_ell( - size_type num_rows, size_type num_cols, size_type source_stride, - const ValueType *__restrict__ source, size_type max_nnz_per_row, - size_type result_stride, IndexType *__restrict__ col_ptrs, - ValueType *__restrict__ values) -{ - const auto tidx = threadIdx.x + blockDim.x * blockIdx.x; - if (tidx < num_rows) { - IndexType col_idx = 0; - for (size_type col = 0; col < num_cols; col++) { - if (source[tidx * source_stride + col] != zero()) { - col_ptrs[col_idx * result_stride + tidx] = col; - values[col_idx * result_stride + tidx] = - source[tidx * source_stride + col]; - col_idx++; - } - } - for (size_type j = col_idx; j < max_nnz_per_row; j++) { - col_ptrs[j * result_stride + tidx] = 0; - values[j * result_stride + tidx] = zero(); - } - } else if (tidx < result_stride) { - for (size_type j = 0; j < max_nnz_per_row; j++) { - col_ptrs[j * result_stride + tidx] = 0; - values[j * result_stride + tidx] = zero(); - } - } -} - - -} // namespace kernel - - template void convert_to_ell(std::shared_ptr exec, - matrix::Ell *result, - const matrix::Dense *source) + const matrix::Dense *source, + matrix::Ell *result) { auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; @@ -590,93 +343,18 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_hybrid(std::shared_ptr exec, - matrix::Hybrid *result, - const matrix::Dense *source) + const matrix::Dense *source, + matrix::Hybrid *result) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL); -namespace kernel { - - -__global__ - __launch_bounds__(cuda_config::warp_size) void calculate_slice_lengths( - size_type num_rows, size_type slice_size, int slice_num, - size_type stride_factor, const size_type *__restrict__ nnz_per_row, - size_type *__restrict__ slice_lengths, - size_type *__restrict__ slice_sets) -{ - constexpr auto warp_size = cuda_config::warp_size; - 
const auto sliceid = blockIdx.x; - const auto tid_in_warp = threadIdx.x; - - if (sliceid * slice_size + tid_in_warp < num_rows) { - size_type thread_result = 0; - for (auto i = tid_in_warp; i < slice_size; i += warp_size) { - thread_result = - (i + slice_size * sliceid < num_rows) - ? max(thread_result, nnz_per_row[sliceid * slice_size + i]) - : thread_result; - } - - auto warp_tile = - group::tiled_partition(group::this_thread_block()); - auto warp_result = reduce( - warp_tile, thread_result, - [](const size_type &a, const size_type &b) { return max(a, b); }); - - if (tid_in_warp == 0) { - auto slice_length = - ceildiv(warp_result, stride_factor) * stride_factor; - slice_lengths[sliceid] = slice_length; - slice_sets[sliceid] = slice_length; - } - } -} - - -template -__global__ __launch_bounds__(default_block_size) void fill_in_sellp( - size_type num_rows, size_type num_cols, size_type slice_size, - size_type stride, const ValueType *__restrict__ source, - size_type *__restrict__ slice_lengths, size_type *__restrict__ slice_sets, - IndexType *__restrict__ col_idxs, ValueType *__restrict__ vals) -{ - const auto global_row = threadIdx.x + blockIdx.x * blockDim.x; - const auto row = global_row % slice_size; - const auto sliceid = global_row / slice_size; - - if (global_row < num_rows) { - size_type sellp_ind = slice_sets[sliceid] * slice_size + row; - - for (size_type col = 0; col < num_cols; col++) { - auto val = source[global_row * stride + col]; - if (val != zero()) { - col_idxs[sellp_ind] = col; - vals[sellp_ind] = val; - sellp_ind += slice_size; - } - } - for (size_type i = sellp_ind; - i < - (slice_sets[sliceid] + slice_lengths[sliceid]) * slice_size + row; - i += slice_size) { - col_idxs[i] = 0; - vals[i] = zero(); - } - } -} - - -} // namespace kernel - - template void convert_to_sellp(std::shared_ptr exec, - matrix::Sellp *result, - const matrix::Dense *source) + const matrix::Dense *source, + matrix::Sellp *result) { const auto stride = source->get_stride(); 
const auto num_rows = result->get_size()[0]; @@ -700,31 +378,18 @@ void convert_to_sellp(std::shared_ptr exec, auto grid_dim = slice_num; - kernel::calculate_slice_lengths<<>>( + kernel::calculate_slice_lengths<<>>( num_rows, slice_size, slice_num, stride_factor, as_cuda_type(nnz_per_row.get_const_data()), as_cuda_type(slice_lengths), as_cuda_type(slice_sets)); - auto add_values = - Array(exec, ceildiv(slice_num + 1, default_block_size)); - grid_dim = ceildiv(slice_num + 1, default_block_size); - - start_prefix_sum<<>>( - slice_num + 1, as_cuda_type(slice_sets), - as_cuda_type(add_values.get_data())); - - finalize_prefix_sum<<>>( - slice_num + 1, as_cuda_type(slice_sets), - as_cuda_type(add_values.get_const_data())); + components::prefix_sum(exec, slice_sets, slice_num + 1); grid_dim = ceildiv(num_rows, default_block_size); kernel::fill_in_sellp<<>>( num_rows, num_cols, slice_size, stride, as_cuda_type(source->get_const_values()), as_cuda_type(slice_lengths), as_cuda_type(slice_sets), as_cuda_type(col_idxs), as_cuda_type(vals)); - - add_values.clear(); - nnz_per_row.clear(); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -733,8 +398,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_sparsity_csr(std::shared_ptr exec, - matrix::SparsityCsr *result, - const matrix::Dense *source) + const matrix::Dense *source, + matrix::SparsityCsr *result) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -751,34 +416,11 @@ void count_nonzeros(std::shared_ptr exec, calculate_nonzeros_per_row(exec, source, &nnz_per_row); *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); - nnz_per_row.clear(); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL); -namespace kernel { - - -__global__ __launch_bounds__(default_block_size) void reduce_max_nnz( - size_type size, const size_type *__restrict__ nnz_per_row, - size_type *__restrict__ result) -{ - extern __shared__ size_type block_max[]; - - 
reduce_array( - size, nnz_per_row, block_max, - [](const size_type &x, const size_type &y) { return max(x, y); }); - - if (threadIdx.x == 0) { - result[blockIdx.x] = block_max[0]; - } -} - - -} // namespace kernel - - template void calculate_max_nnz_per_row(std::shared_ptr exec, const matrix::Dense *source, @@ -807,11 +449,7 @@ void calculate_max_nnz_per_row(std::shared_ptr exec, grid_dim, as_cuda_type(block_results.get_const_data()), as_cuda_type(d_result.get_data())); - exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(), - result); - d_result.clear(); - block_results.clear(); - nnz_per_row.clear(); + *result = exec->copy_val_to_host(d_result.get_const_data()); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -824,7 +462,7 @@ void calculate_nonzeros_per_row(std::shared_ptr exec, Array *result) { const dim3 block_size(default_block_size, 1, 1); - auto rows_per_block = ceildiv(default_block_size, cuda_config::warp_size); + auto rows_per_block = ceildiv(default_block_size, config::warp_size); const size_t grid_x = ceildiv(source->get_size()[0], rows_per_block); const dim3 grid_size(grid_x, 1, 1); kernel::count_nnz_per_row<<>>( @@ -837,57 +475,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL); -namespace kernel { - - -__global__ __launch_bounds__(default_block_size) void reduce_max_nnz_per_slice( - size_type num_rows, size_type slice_size, size_type stride_factor, - const size_type *__restrict__ nnz_per_row, size_type *__restrict__ result) -{ - const auto tidx = threadIdx.x + blockIdx.x * blockDim.x; - constexpr auto warp_size = cuda_config::warp_size; - const auto warpid = tidx / warp_size; - const auto tid_in_warp = tidx % warp_size; - const auto slice_num = ceildiv(num_rows, slice_size); - - size_type thread_result = 0; - for (auto i = tid_in_warp; i < slice_size; i += warp_size) { - if (warpid * slice_size + i < num_rows) { - thread_result = - max(thread_result, nnz_per_row[warpid * slice_size + i]); - } - 
} - - auto warp_tile = - group::tiled_partition(group::this_thread_block()); - auto warp_result = reduce( - warp_tile, thread_result, - [](const size_type &a, const size_type &b) { return max(a, b); }); - - if (tid_in_warp == 0 && warpid < slice_num) { - result[warpid] = ceildiv(warp_result, stride_factor) * stride_factor; - } -} - - -__global__ __launch_bounds__(default_block_size) void reduce_total_cols( - size_type num_slices, const size_type *__restrict__ max_nnz_per_slice, - size_type *__restrict__ result) -{ - extern __shared__ size_type block_result[]; - - reduce_array(num_slices, max_nnz_per_slice, block_result, - [](const size_type &x, const size_type &y) { return x + y; }); - - if (threadIdx.x == 0) { - result[blockIdx.x] = block_result[0]; - } -} - - -} // namespace kernel - - template void calculate_total_cols(std::shared_ptr exec, const matrix::Dense *source, @@ -904,8 +491,7 @@ void calculate_total_cols(std::shared_ptr exec, auto max_nnz_per_slice = Array(exec, slice_num); - auto grid_dim = - ceildiv(slice_num * cuda_config::warp_size, default_block_size); + auto grid_dim = ceildiv(slice_num * config::warp_size, default_block_size); kernel::reduce_max_nnz_per_slice<<>>( num_rows, slice_size, stride_factor, @@ -927,13 +513,7 @@ void calculate_total_cols(std::shared_ptr exec, grid_dim, as_cuda_type(block_results.get_const_data()), as_cuda_type(d_result.get_data())); - exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(), - result); - - block_results.clear(); - nnz_per_row.clear(); - max_nnz_per_slice.clear(); - d_result.clear(); + *result = exec->copy_val_to_host(d_result.get_const_data()); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -942,8 +522,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void transpose(std::shared_ptr exec, - matrix::Dense *trans, - const matrix::Dense *orig) + const matrix::Dense *orig, + matrix::Dense *trans) { if (cublas::is_supported::value) { auto handle = exec->get_cublas_handle(); @@ -967,9 +547,8 @@ 
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TRANSPOSE_KERNEL); template void conj_transpose(std::shared_ptr exec, - matrix::Dense *trans, - const matrix::Dense *orig) - + const matrix::Dense *orig, + matrix::Dense *trans) { if (cublas::is_supported::value) { auto handle = exec->get_cublas_handle(); @@ -986,11 +565,96 @@ void conj_transpose(std::shared_ptr exec, } else { GKO_NOT_IMPLEMENTED; } -}; +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CONJ_TRANSPOSE_KERNEL); +template +void row_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Dense *orig, + matrix::Dense *row_permuted) +{ + constexpr auto block_size = default_block_size; + const dim3 grid_dim = + ceildiv(orig->get_size()[0] * orig->get_size()[1], block_size); + const dim3 block_dim{config::warp_size, 1, block_size / config::warp_size}; + kernel::row_permute<<>>( + orig->get_size()[0], orig->get_size()[1], + as_cuda_type(permutation_indices->get_const_data()), + as_cuda_type(orig->get_const_values()), orig->get_stride(), + as_cuda_type(row_permuted->get_values()), row_permuted->get_stride()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ROW_PERMUTE_KERNEL); + + +template +void column_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Dense *orig, + matrix::Dense *column_permuted) +{ + constexpr auto block_size = default_block_size; + const dim3 grid_dim = + ceildiv(orig->get_size()[0] * orig->get_size()[1], block_size); + const dim3 block_dim{config::warp_size, 1, block_size / config::warp_size}; + kernel::column_permute<<>>( + orig->get_size()[0], orig->get_size()[1], + as_cuda_type(permutation_indices->get_const_data()), + as_cuda_type(orig->get_const_values()), orig->get_stride(), + as_cuda_type(column_permuted->get_values()), + column_permuted->get_stride()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_COLUMN_PERMUTE_KERNEL); + + +template +void inverse_row_permute(std::shared_ptr exec, + 
const Array *permutation_indices, + const matrix::Dense *orig, + matrix::Dense *row_permuted) +{ + constexpr auto block_size = default_block_size; + const dim3 grid_dim = + ceildiv(orig->get_size()[0] * orig->get_size()[1], block_size); + const dim3 block_dim{config::warp_size, 1, block_size / config::warp_size}; + kernel::inverse_row_permute<<>>( + orig->get_size()[0], orig->get_size()[1], + as_cuda_type(permutation_indices->get_const_data()), + as_cuda_type(orig->get_const_values()), orig->get_stride(), + as_cuda_type(row_permuted->get_values()), row_permuted->get_stride()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_INVERSE_ROW_PERMUTE_KERNEL); + + +template +void inverse_column_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Dense *orig, + matrix::Dense *column_permuted) +{ + constexpr auto block_size = default_block_size; + const dim3 grid_dim = + ceildiv(orig->get_size()[0] * orig->get_size()[1], block_size); + const dim3 block_dim{config::warp_size, 1, block_size / config::warp_size}; + kernel::inverse_column_permute<<>>( + orig->get_size()[0], orig->get_size()[1], + as_cuda_type(permutation_indices->get_const_data()), + as_cuda_type(orig->get_const_values()), orig->get_stride(), + as_cuda_type(column_permuted->get_values()), + column_permuted->get_stride()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_INVERSE_COLUMN_PERMUTE_KERNEL); + + } // namespace dense } // namespace cuda } // namespace kernels diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index 76abffe5859..aded7cb11ad 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -43,16 +43,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/components/fill_array.hpp" +#include "core/components/prefix_sum.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" +#include "cuda/base/config.hpp" #include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" #include "cuda/components/cooperative_groups.cuh" #include "cuda/components/format_conversion.cuh" -#include "cuda/components/prefix_sum.cuh" #include "cuda/components/reduction.cuh" -#include "cuda/components/zero_array.hpp" +#include "cuda/components/thread_ids.cuh" namespace gko { @@ -75,6 +77,8 @@ constexpr int default_block_size = 512; * `num_threads_per_core` threads assigned to each physical core. */ constexpr int num_threads_per_core = 4; + + /** * ratio is the parameter to decide when to use threads to do reduction on each * row. (#cols/#rows > ratio) @@ -82,128 +86,30 @@ constexpr int num_threads_per_core = 4; constexpr double ratio = 1e-2; +/** + * max_thread_per_worker is the max number of thread per worker. The + * `compiled_kernels` must be a list <0, 1, 2, ..., max_thread_per_worker> + */ +constexpr int max_thread_per_worker = 32; + + /** * A compile-time list of sub-warp sizes for which the spmv kernels should be * compiled. * 0 is a special case where it uses a sub-warp size of warp_size in * combination with atomic_adds. 
*/ -using compiled_kernels = - syn::value_list; - +using compiled_kernels = syn::value_list; -namespace kernel { -namespace { - -template -__device__ void spmv_kernel(const size_type num_rows, const int nwarps_per_row, - const ValueType *__restrict__ val, - const IndexType *__restrict__ col, - const size_type stride, - const size_type num_stored_elements_per_row, - const ValueType *__restrict__ b, - const size_type b_stride, ValueType *__restrict__ c, - const size_type c_stride, Closure op) -{ - const auto tidx = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; - const IndexType x = tidx / subwarp_size / nwarps_per_row; - const auto warp_id = tidx / subwarp_size % nwarps_per_row; - const auto y_start = tidx % subwarp_size + - num_stored_elements_per_row * warp_id / nwarps_per_row; - const auto y_end = - num_stored_elements_per_row * (warp_id + 1) / nwarps_per_row; - if (x < num_rows) { - const auto tile_block = - group::tiled_partition(group::this_thread_block()); - ValueType temp = zero(); - const auto column_id = blockIdx.y; - for (IndexType idx = y_start; idx < y_end; idx += subwarp_size) { - const auto ind = x + idx * stride; - const auto col_idx = col[ind]; - if (col_idx < idx) { - break; - } else { - temp += val[ind] * b[col_idx * b_stride + column_id]; - } - } - const auto answer = reduce( - tile_block, temp, [](ValueType x, ValueType y) { return x + y; }); - if (tile_block.thread_rank() == 0) { - if (atomic) { - atomic_add(&(c[x * c_stride + column_id]), - op(answer, c[x * c_stride + column_id])); - } else { - c[x * c_stride + column_id] = - op(answer, c[x * c_stride + column_id]); - } - } - } -} - - -template -__global__ __launch_bounds__(default_block_size) void spmv( - const size_type num_rows, const int nwarps_per_row, - const ValueType *__restrict__ val, const IndexType *__restrict__ col, - const size_type stride, const size_type num_stored_elements_per_row, - const ValueType *__restrict__ b, const size_type b_stride, - ValueType *__restrict__ c, 
const size_type c_stride) -{ - spmv_kernel( - num_rows, nwarps_per_row, val, col, stride, num_stored_elements_per_row, - b, b_stride, c, c_stride, - [](const ValueType &x, const ValueType &y) { return x; }); -} - - -template -__global__ __launch_bounds__(default_block_size) void spmv( - const size_type num_rows, const int nwarps_per_row, - const ValueType *__restrict__ alpha, const ValueType *__restrict__ val, - const IndexType *__restrict__ col, const size_type stride, - const size_type num_stored_elements_per_row, - const ValueType *__restrict__ b, const size_type b_stride, - const ValueType *__restrict__ beta, ValueType *__restrict__ c, - const size_type c_stride) -{ - const ValueType alpha_val = alpha[0]; - const ValueType beta_val = beta[0]; - // Because the atomic operation changes the values of c during computation, - // it can not do the right alpha * a * b + beta * c operation. - // Thus, the cuda kernel only computes alpha * a * b when it uses atomic - // operation. - if (atomic) { - spmv_kernel( - num_rows, nwarps_per_row, val, col, stride, - num_stored_elements_per_row, b, b_stride, c, c_stride, - [&alpha_val](const ValueType &x, const ValueType &y) { - return alpha_val * x; - }); - } else { - spmv_kernel( - num_rows, nwarps_per_row, val, col, stride, - num_stored_elements_per_row, b, b_stride, c, c_stride, - [&alpha_val, &beta_val](const ValueType &x, const ValueType &y) { - return alpha_val * x + beta_val * y; - }); - } -} - - -} // namespace -} // namespace kernel +#include "common/matrix/ell_kernels.hpp.inc" namespace { template -void abstract_spmv(syn::value_list, int nwarps_per_row, +void abstract_spmv(syn::value_list, int num_worker_per_row, const matrix::Ell *a, const matrix::Dense *b, matrix::Dense *c, @@ -211,27 +117,31 @@ void abstract_spmv(syn::value_list, int nwarps_per_row, const matrix::Dense *beta = nullptr) { const auto nrows = a->get_size()[0]; - constexpr int subwarp_size = (info == 0) ? 
cuda_config::warp_size : info; + constexpr int num_thread_per_worker = + (info == 0) ? max_thread_per_worker : info; constexpr bool atomic = (info == 0); - const dim3 block_size(default_block_size, 1, 1); - const dim3 grid_size( - ceildiv(nrows * subwarp_size * nwarps_per_row, block_size.x), - b->get_size()[1], 1); + const dim3 block_size(default_block_size / num_thread_per_worker, + num_thread_per_worker, 1); + const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x), + b->get_size()[1], 1); if (alpha == nullptr && beta == nullptr) { - kernel::spmv<<>>( - nrows, nwarps_per_row, as_cuda_type(a->get_const_values()), - a->get_const_col_idxs(), a->get_stride(), - a->get_num_stored_elements_per_row(), - as_cuda_type(b->get_const_values()), b->get_stride(), - as_cuda_type(c->get_values()), c->get_stride()); + kernel::spmv + <<>>( + nrows, num_worker_per_row, as_cuda_type(a->get_const_values()), + a->get_const_col_idxs(), a->get_stride(), + a->get_num_stored_elements_per_row(), + as_cuda_type(b->get_const_values()), b->get_stride(), + as_cuda_type(c->get_values()), c->get_stride()); } else if (alpha != nullptr && beta != nullptr) { - kernel::spmv<<>>( - nrows, nwarps_per_row, as_cuda_type(alpha->get_const_values()), - as_cuda_type(a->get_const_values()), a->get_const_col_idxs(), - a->get_stride(), a->get_num_stored_elements_per_row(), - as_cuda_type(b->get_const_values()), b->get_stride(), - as_cuda_type(beta->get_const_values()), - as_cuda_type(c->get_values()), c->get_stride()); + kernel::spmv + <<>>( + nrows, num_worker_per_row, + as_cuda_type(alpha->get_const_values()), + as_cuda_type(a->get_const_values()), a->get_const_col_idxs(), + a->get_stride(), a->get_num_stored_elements_per_row(), + as_cuda_type(b->get_const_values()), b->get_stride(), + as_cuda_type(beta->get_const_values()), + as_cuda_type(c->get_values()), c->get_stride()); } else { GKO_KERNEL_NOT_FOUND; } @@ -241,42 +151,43 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_abstract_spmv, 
abstract_spmv); template -std::array compute_subwarp_size_and_atomicity( +std::array compute_thread_worker_and_atomicity( std::shared_ptr exec, const matrix::Ell *a) { - int subwarp_size = 1; + int num_thread_per_worker = 1; int atomic = 0; - int nwarps_per_row = 1; + int num_worker_per_row = 1; const auto nrows = a->get_size()[0]; const auto ell_ncols = a->get_num_stored_elements_per_row(); - const auto nwarps = exec->get_num_cores_per_sm() / cuda_config::warp_size * + // TODO: num_threads_per_core should be tuned for AMD gpu + const auto nwarps = exec->get_num_warps_per_sm() * exec->get_num_multiprocessor() * num_threads_per_core; // Use multithreads to perform the reduction on each row when the matrix is // wide. // To make every thread have computation, so pick the value which is the - // power of 2 less than warp_size and is less than or equal to ell_ncols. If - // the subwarp_size is warp_size and allow more than one warps to work on - // the same row, use atomic add to handle the warps write the value into the - // same position. The #warps is decided according to the number of warps - // allowed on GPU. + // power of 2 less than max_thread_per_worker and is less than or equal to + // ell_ncols. If the num_thread_per_worker is max_thread_per_worker and + // allow more than one worker to work on the same row, use atomic add to + // handle the worker write the value into the same position. The #worker is + // decided according to the number of worker allowed on GPU. 
if (static_cast(ell_ncols) / nrows > ratio) { - while (subwarp_size < cuda_config::warp_size && - (subwarp_size << 1) <= ell_ncols) { - subwarp_size <<= 1; + while (num_thread_per_worker < max_thread_per_worker && + (num_thread_per_worker << 1) <= ell_ncols) { + num_thread_per_worker <<= 1; } - if (subwarp_size == cuda_config::warp_size) { - nwarps_per_row = - std::min(ell_ncols / cuda_config::warp_size, nwarps / nrows); - nwarps_per_row = std::max(nwarps_per_row, 1); + if (num_thread_per_worker == max_thread_per_worker) { + num_worker_per_row = + std::min(ell_ncols / max_thread_per_worker, nwarps / nrows); + num_worker_per_row = std::max(num_worker_per_row, 1); } - if (nwarps_per_row > 1) { + if (num_worker_per_row > 1) { atomic = 1; } } - return {subwarp_size, atomic, nwarps_per_row}; + return {num_thread_per_worker, atomic, num_worker_per_row}; } @@ -288,24 +199,26 @@ void spmv(std::shared_ptr exec, const matrix::Ell *a, const matrix::Dense *b, matrix::Dense *c) { - const auto data = compute_subwarp_size_and_atomicity(exec, a); - const int subwarp_size = std::get<0>(data); + const auto data = compute_thread_worker_and_atomicity(exec, a); + const int num_thread_per_worker = std::get<0>(data); const int atomic = std::get<1>(data); - const int nwarps_per_row = std::get<2>(data); + const int num_worker_per_row = std::get<2>(data); /** * info is the parameter for selecting the cuda kernel. 
* for info == 0, it uses the kernel by warp_size threads with atomic * operation for other value, it uses the kernel without atomic_add */ - const int info = (!atomic) * subwarp_size; + const int info = (!atomic) * num_thread_per_worker; if (atomic) { - zero_array(c->get_num_stored_elements(), c->get_values()); + components::fill_array(exec, c->get_values(), + c->get_num_stored_elements(), zero()); } select_abstract_spmv( compiled_kernels(), [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), nwarps_per_row, a, b, c); + syn::value_list(), syn::type_list<>(), num_worker_per_row, a, b, + c); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL); @@ -319,24 +232,24 @@ void advanced_spmv(std::shared_ptr exec, const matrix::Dense *beta, matrix::Dense *c) { - const auto data = compute_subwarp_size_and_atomicity(exec, a); - const int subwarp_size = std::get<0>(data); + const auto data = compute_thread_worker_and_atomicity(exec, a); + const int num_thread_per_worker = std::get<0>(data); const int atomic = std::get<1>(data); - const int nwarps_per_row = std::get<2>(data); + const int num_worker_per_row = std::get<2>(data); /** * info is the parameter for selecting the cuda kernel. 
* for info == 0, it uses the kernel by warp_size threads with atomic * operation for other value, it uses the kernel without atomic_add */ - const int info = (!atomic) * subwarp_size; + const int info = (!atomic) * num_thread_per_worker; if (atomic) { dense::scale(exec, beta, c); } select_abstract_spmv( compiled_kernels(), [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), nwarps_per_row, a, b, c, + syn::value_list(), syn::type_list<>(), num_worker_per_row, a, b, c, alpha, beta); } @@ -344,48 +257,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); -namespace kernel { - - -template -__global__ - __launch_bounds__(cuda_config::max_block_size) void initialize_zero_dense( - size_type num_rows, size_type num_cols, size_type stride, - ValueType *__restrict__ result) -{ - const auto tidx_x = threadIdx.x + blockDim.x * blockIdx.x; - const auto tidx_y = threadIdx.y + blockDim.y * blockIdx.y; - if (tidx_x < num_cols && tidx_y < num_rows) { - result[tidx_y * stride + tidx_x] = zero(); - } -} - - -template -__global__ __launch_bounds__(default_block_size) void fill_in_dense( - size_type num_rows, size_type nnz, size_type source_stride, - const IndexType *__restrict__ col_idxs, - const ValueType *__restrict__ values, size_type result_stride, - ValueType *__restrict__ result) -{ - const auto tidx = threadIdx.x + blockIdx.x * blockDim.x; - if (tidx < num_rows) { - for (auto col = 0; col < nnz; col++) { - result[tidx * result_stride + - col_idxs[tidx + col * source_stride]] += - values[tidx + col * source_stride]; - } - } -} - - -} // namespace kernel - - template void convert_to_dense(std::shared_ptr exec, - matrix::Dense *result, - const matrix::Ell *source) + const matrix::Ell *source, + matrix::Dense *result) { const auto num_rows = result->get_size()[0]; const auto num_cols = result->get_size()[1]; @@ -394,9 +269,8 @@ void convert_to_dense(std::shared_ptr exec, const auto vals = 
source->get_const_values(); const auto source_stride = source->get_stride(); - const dim3 block_size(cuda_config::warp_size, - cuda_config::max_block_size / cuda_config::warp_size, - 1); + const dim3 block_size(config::warp_size, + config::max_block_size / config::warp_size, 1); const dim3 init_grid_dim(ceildiv(result_stride, block_size.x), ceildiv(num_rows, block_size.y), 1); kernel::initialize_zero_dense<<>>( @@ -413,68 +287,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL); -namespace kernel { - - -template -__global__ __launch_bounds__(default_block_size) void count_nnz_per_row( - size_type num_rows, size_type max_nnz_per_row, size_type stride, - const ValueType *__restrict__ values, IndexType *__restrict__ result) -{ - constexpr auto warp_size = cuda_config::warp_size; - const auto tidx = threadIdx.x + blockIdx.x * blockDim.x; - const auto row_idx = tidx / warp_size; - - if (row_idx < num_rows) { - IndexType part_result{}; - for (auto i = threadIdx.x % warp_size; i < max_nnz_per_row; - i += warp_size) { - if (values[stride * i + row_idx] != zero()) { - part_result += 1; - } - } - - auto warp_tile = - group::tiled_partition(group::this_thread_block()); - result[row_idx] = reduce( - warp_tile, part_result, - [](const size_type &a, const size_type &b) { return a + b; }); - } -} - - -template -__global__ __launch_bounds__(default_block_size) void fill_in_csr( - size_type num_rows, size_type max_nnz_per_row, size_type stride, - const ValueType *__restrict__ source_values, - const IndexType *__restrict__ source_col_idxs, - IndexType *__restrict__ result_row_ptrs, - IndexType *__restrict__ result_col_idxs, - ValueType *__restrict__ result_values) -{ - const auto tidx = threadIdx.x + blockDim.x * blockIdx.x; - - if (tidx < num_rows) { - auto write_to = result_row_ptrs[tidx]; - for (auto i = 0; i < max_nnz_per_row; i++) { - const auto source_idx = tidx + stride * i; - if (source_values[source_idx] != zero()) { - 
result_values[write_to] = source_values[source_idx]; - result_col_idxs[write_to] = source_col_idxs[source_idx]; - write_to++; - } - } - } -} - - -} // namespace kernel - - template void convert_to_csr(std::shared_ptr exec, - matrix::Csr *result, - const matrix::Ell *source) + const matrix::Ell *source, + matrix::Csr *result) { auto num_rows = result->get_size()[0]; @@ -486,31 +302,22 @@ void convert_to_csr(std::shared_ptr exec, const auto max_nnz_per_row = source->get_num_stored_elements_per_row(); constexpr auto rows_per_block = - ceildiv(default_block_size, cuda_config::warp_size); + ceildiv(default_block_size, config::warp_size); const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); kernel::count_nnz_per_row<<>>( num_rows, max_nnz_per_row, stride, as_cuda_type(source->get_const_values()), as_cuda_type(row_ptrs)); - size_type grid_dim = ceildiv(num_rows + 1, default_block_size); - auto add_values = Array(exec, grid_dim); + components::prefix_sum(exec, row_ptrs, num_rows + 1); - start_prefix_sum - <<>>(num_rows + 1, as_cuda_type(row_ptrs), - as_cuda_type(add_values.get_data())); - - finalize_prefix_sum<<>>( - num_rows + 1, as_cuda_type(row_ptrs), - as_cuda_type(add_values.get_const_data())); + size_type grid_dim = ceildiv(num_rows, default_block_size); kernel::fill_in_csr<<>>( num_rows, max_nnz_per_row, stride, as_cuda_type(source->get_const_values()), as_cuda_type(source->get_const_col_idxs()), as_cuda_type(row_ptrs), as_cuda_type(col_idxs), as_cuda_type(values)); - - add_values.clear(); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -528,7 +335,6 @@ void count_nonzeros(std::shared_ptr exec, calculate_nonzeros_per_row(exec, source, &nnz_per_row); *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); - nnz_per_row.clear(); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -545,7 +351,7 @@ void calculate_nonzeros_per_row(std::shared_ptr exec, const auto stride = source->get_stride(); const auto values = 
source->get_const_values(); - const auto warp_size = cuda_config::warp_size; + const auto warp_size = config::warp_size; const auto grid_dim = ceildiv(num_rows * warp_size, default_block_size); kernel::count_nnz_per_row<<>>( diff --git a/cuda/matrix/hybrid_kernels.cu b/cuda/matrix/hybrid_kernels.cu index 92519dc02ba..7b731559672 100644 --- a/cuda/matrix/hybrid_kernels.cu +++ b/cuda/matrix/hybrid_kernels.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -37,16 +37,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/components/fill_array.hpp" +#include "core/components/prefix_sum.hpp" #include "core/matrix/coo_kernels.hpp" #include "core/matrix/ell_kernels.hpp" +#include "cuda/base/config.hpp" #include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" #include "cuda/components/cooperative_groups.cuh" #include "cuda/components/format_conversion.cuh" -#include "cuda/components/prefix_sum.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/segment_scan.cuh" -#include "cuda/components/zero_array.hpp" +#include "cuda/components/thread_ids.cuh" namespace gko { @@ -64,131 +66,22 @@ constexpr int default_block_size = 512; constexpr int warps_in_block = 4; -template -void convert_to_dense( - std::shared_ptr exec, matrix::Dense *result, - const matrix::Hybrid *source) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_HYBRID_CONVERT_TO_DENSE_KERNEL); - - -namespace kernel { - - -/** - * The global function for counting the number of nonzeros per row of COO. - * It is almost like COO spmv routine. 
- * It performs is_nonzeros(Coo) times the vector whose values are one - * - * @param nnz the number of nonzeros in the matrix - * @param num_line the maximum round of each warp - * @param val the value array of the matrix - * @param row the row index array of the matrix - * @param nnz_per_row the output nonzeros per row - */ -template -__global__ __launch_bounds__(default_block_size) void count_coo_row_nnz( - const size_type nnz, const size_type num_lines, - const ValueType *__restrict__ val, const IndexType *__restrict__ row, - IndexType *__restrict__ nnz_per_row) -{ - IndexType temp_val = 0; - const auto start = static_cast(blockDim.x) * blockIdx.x * - blockDim.y * num_lines + - threadIdx.y * blockDim.x * num_lines; - size_type num = (nnz > start) * ceildiv(nnz - start, subwarp_size); - num = min(num, num_lines); - const IndexType ind_start = start + threadIdx.x; - const IndexType ind_end = ind_start + (num - 1) * subwarp_size; - IndexType ind = ind_start; - IndexType curr_row = (ind < nnz) ? row[ind] : 0; - const auto tile_block = - group::tiled_partition(group::this_thread_block()); - for (; ind < ind_end; ind += subwarp_size) { - temp_val += ind < nnz && val[ind] != zero(); - auto next_row = - (ind + subwarp_size < nnz) ? 
row[ind + subwarp_size] : row[nnz - 1]; - // segmented scan - if (tile_block.any(curr_row != next_row)) { - bool is_first_in_segment = - segment_scan(tile_block, curr_row, &temp_val); - if (is_first_in_segment) { - atomic_add(&(nnz_per_row[curr_row]), temp_val); - } - temp_val = 0; - } - curr_row = next_row; - } - if (num > 0) { - ind = ind_end; - temp_val += ind < nnz && val[ind] != zero(); - // segmented scan - - bool is_first_in_segment = - segment_scan(tile_block, curr_row, &temp_val); - if (is_first_in_segment) { - atomic_add(&(nnz_per_row[curr_row]), temp_val); - } - } -} +#include "common/matrix/hybrid_kernels.hpp.inc" template -__global__ __launch_bounds__(default_block_size) void fill_in_csr( - size_type num_rows, size_type max_nnz_per_row, size_type stride, - const ValueType *__restrict__ ell_val, - const IndexType *__restrict__ ell_col, - const ValueType *__restrict__ coo_val, - const IndexType *__restrict__ coo_col, - const IndexType *__restrict__ coo_offset, - IndexType *__restrict__ result_row_ptrs, - IndexType *__restrict__ result_col_idxs, - ValueType *__restrict__ result_values) -{ - const auto tidx = threadIdx.x + blockDim.x * blockIdx.x; - - if (tidx < num_rows) { - auto write_to = result_row_ptrs[tidx]; - for (auto i = 0; i < max_nnz_per_row; i++) { - const auto source_idx = tidx + stride * i; - if (ell_val[source_idx] != zero()) { - result_values[write_to] = ell_val[source_idx]; - result_col_idxs[write_to] = ell_col[source_idx]; - write_to++; - } - } - for (auto i = coo_offset[tidx]; i < coo_offset[tidx + 1]; i++) { - if (coo_val[i] != zero()) { - result_values[write_to] = coo_val[i]; - result_col_idxs[write_to] = coo_col[i]; - write_to++; - } - } - } -} - - -template -__global__ __launch_bounds__(default_block_size) void add( - size_type num, ValueType1 *__restrict__ val1, - const ValueType2 *__restrict__ val2) -{ - const auto tidx = threadIdx.x + blockDim.x * blockIdx.x; - if (tidx < num) { - val1[tidx] += val2[tidx]; - } -} - +void 
convert_to_dense(std::shared_ptr exec, + const matrix::Hybrid *source, + matrix::Dense *result) GKO_NOT_IMPLEMENTED; -} // namespace kernel +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_HYBRID_CONVERT_TO_DENSE_KERNEL); template void convert_to_csr(std::shared_ptr exec, - matrix::Csr *result, - const matrix::Hybrid *source) + const matrix::Hybrid *source, + matrix::Csr *result) { const auto num_rows = source->get_size()[0]; auto coo_offset = Array(exec, num_rows + 1); @@ -211,20 +104,21 @@ void convert_to_csr(std::shared_ptr exec, auto row_ptrs = result->get_row_ptrs(); auto coo_row_ptrs = Array(exec, num_rows); - zero_array(num_rows + 1, row_ptrs); + components::fill_array(exec, row_ptrs, num_rows + 1, zero()); grid_num = ceildiv(num_rows, warps_in_block); ell::kernel::count_nnz_per_row<<>>( num_rows, max_nnz_per_row, stride, as_cuda_type(ell_val), as_cuda_type(row_ptrs)); - zero_array(num_rows, coo_row_ptrs.get_data()); + components::fill_array(exec, coo_row_ptrs.get_data(), num_rows, + zero()); auto nwarps = coo::host_kernel::calculate_nwarps(exec, coo_num_stored_elements); if (nwarps > 0) { int num_lines = - ceildiv(coo_num_stored_elements, nwarps * cuda_config::warp_size); - const dim3 coo_block(cuda_config::warp_size, warps_in_block, 1); + ceildiv(coo_num_stored_elements, nwarps * config::warp_size); + const dim3 coo_block(config::warp_size, warps_in_block, 1); const dim3 coo_grid(ceildiv(nwarps, warps_in_block), 1); kernel::count_coo_row_nnz<<>>( @@ -236,16 +130,7 @@ void convert_to_csr(std::shared_ptr exec, num_rows, as_cuda_type(row_ptrs), as_cuda_type(coo_row_ptrs.get_const_data())); - grid_num = ceildiv(num_rows + 1, default_block_size); - auto add_values = Array(exec, grid_num); - - start_prefix_sum - <<>>(num_rows + 1, as_cuda_type(row_ptrs), - as_cuda_type(add_values.get_data())); - - finalize_prefix_sum<<>>( - num_rows + 1, as_cuda_type(row_ptrs), - as_cuda_type(add_values.get_const_data())); + components::prefix_sum(exec, row_ptrs, 
num_rows + 1); // Fill the value grid_num = ceildiv(num_rows, default_block_size); @@ -273,12 +158,13 @@ void count_nonzeros(std::shared_ptr exec, auto nnz = source->get_coo_num_stored_elements(); auto nwarps = coo::host_kernel::calculate_nwarps(exec, nnz); if (nwarps > 0) { - int num_lines = ceildiv(nnz, nwarps * cuda_config::warp_size); - const dim3 coo_block(cuda_config::warp_size, warps_in_block, 1); + int num_lines = ceildiv(nnz, nwarps * config::warp_size); + const dim3 coo_block(config::warp_size, warps_in_block, 1); const dim3 coo_grid(ceildiv(nwarps, warps_in_block), 1); const auto num_rows = source->get_size()[0]; auto nnz_per_row = Array(exec, num_rows); - zero_array(num_rows, nnz_per_row.get_data()); + components::fill_array(exec, nnz_per_row.get_data(), num_rows, + zero()); kernel::count_coo_row_nnz<<>>( nnz, num_lines, as_cuda_type(source->get_coo()->get_const_values()), as_cuda_type(source->get_coo()->get_const_row_idxs()), diff --git a/cuda/matrix/sellp_kernels.cu b/cuda/matrix/sellp_kernels.cu index 1f5a058ac9c..175fb65e078 100644 --- a/cuda/matrix/sellp_kernels.cu +++ b/cuda/matrix/sellp_kernels.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -40,10 +40,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/components/prefix_sum.hpp" +#include "cuda/base/config.hpp" #include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/types.hpp" -#include "cuda/components/prefix_sum.cuh" #include "cuda/components/reduction.cuh" +#include "cuda/components/thread_ids.cuh" namespace gko { @@ -57,37 +59,10 @@ namespace cuda { namespace sellp { -namespace { - constexpr auto default_block_size = 512; -template -__global__ __launch_bounds__(matrix::default_slice_size) void spmv_kernel( - size_type num_rows, size_type num_right_hand_sides, size_type b_stride, - size_type c_stride, const size_type *__restrict__ slice_lengths, - const size_type *__restrict__ slice_sets, const ValueType *__restrict__ a, - const IndexType *__restrict__ col, const ValueType *__restrict__ b, - ValueType *__restrict__ c) -{ - const auto slice_id = blockIdx.x; - const auto slice_size = blockDim.x; - const auto row_in_slice = threadIdx.x; - const auto global_row = - static_cast(slice_size) * slice_id + row_in_slice; - const auto column_id = blockIdx.y; - ValueType val = 0; - IndexType ind = 0; - if (global_row < num_rows && column_id < num_right_hand_sides) { - for (size_type i = 0; i < slice_lengths[slice_id]; i++) { - ind = row_in_slice + (slice_sets[slice_id] + i) * slice_size; - val += a[ind] * b[col[ind] * b_stride + column_id]; - } - c[global_row * c_stride + column_id] = val; - } -} - -} // namespace +#include "common/matrix/sellp_kernels.hpp.inc" template @@ -109,41 +84,6 @@ void spmv(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL); -namespace { - - -template -__global__ - __launch_bounds__(matrix::default_slice_size) void advanced_spmv_kernel( - size_type num_rows, size_type num_right_hand_sides, size_type b_stride, - size_type c_stride, const size_type *__restrict__ slice_lengths, - const size_type *__restrict__ slice_sets, - const ValueType *__restrict__ alpha, const ValueType *__restrict__ a, - const IndexType 
*__restrict__ col, const ValueType *__restrict__ b, - const ValueType *__restrict__ beta, ValueType *__restrict__ c) -{ - const auto slice_id = blockIdx.x; - const auto slice_size = blockDim.x; - const auto row_in_slice = threadIdx.x; - const auto global_row = - static_cast(slice_size) * slice_id + row_in_slice; - const auto column_id = blockIdx.y; - ValueType val = 0; - IndexType ind = 0; - if (global_row < num_rows && column_id < num_right_hand_sides) { - for (size_type i = 0; i < slice_lengths[slice_id]; i++) { - ind = row_in_slice + (slice_sets[slice_id] + i) * slice_size; - val += alpha[0] * a[ind] * b[col[ind] * b_stride + column_id]; - } - c[global_row * c_stride + column_id] = - beta[0] * c[global_row * c_stride + column_id] + val; - } -} - - -} // namespace - - template void advanced_spmv(std::shared_ptr exec, const matrix::Dense *alpha, @@ -169,57 +109,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL); -namespace kernel { - - -template -__global__ __launch_bounds__(default_block_size) void initialize_zero_dense( - size_type num_rows, size_type num_cols, size_type stride, - ValueType *__restrict__ result) -{ - const auto tidx_x = threadIdx.x + blockDim.x * blockIdx.x; - const auto tidx_y = threadIdx.y + blockDim.y * blockIdx.y; - if (tidx_x < num_cols && tidx_y < num_rows) { - result[tidx_y * stride + tidx_x] = zero(); - } -} - - -template -__global__ __launch_bounds__(default_block_size) void fill_in_dense( - size_type num_rows, size_type num_cols, size_type stride, - size_type slice_size, const size_type *__restrict__ slice_lengths, - const size_type *__restrict__ slice_sets, - const IndexType *__restrict__ col_idxs, - const ValueType *__restrict__ values, ValueType *__restrict__ result) -{ - const auto global_row = - (blockDim.x * blockIdx.x + threadIdx.x) / threads_per_row; - const auto row = global_row % slice_size; - const auto slice = global_row / slice_size; - const auto start_index = threadIdx.x % 
threads_per_row; - - if (global_row < num_rows) { - for (auto i = start_index; i < slice_lengths[slice]; - i += threads_per_row) { - if (values[(slice_sets[slice] + i) * slice_size + row] != - zero()) { - result[global_row * stride + - col_idxs[(slice_sets[slice] + i) * slice_size + row]] = - values[(slice_sets[slice] + i) * slice_size + row]; - } - } - } -} - - -} // namespace kernel - - template void convert_to_dense(std::shared_ptr exec, - matrix::Dense *result, - const matrix::Sellp *source) + const matrix::Sellp *source, + matrix::Dense *result) { const auto num_rows = source->get_size()[0]; const auto num_cols = source->get_size()[1]; @@ -231,9 +124,8 @@ void convert_to_dense(std::shared_ptr exec, const auto slice_num = ceildiv(num_rows, slice_size); - const dim3 block_size(cuda_config::warp_size, - cuda_config::max_block_size / cuda_config::warp_size, - 1); + const dim3 block_size(config::warp_size, + config::max_block_size / config::warp_size, 1); const dim3 init_grid_dim(ceildiv(result->get_stride(), block_size.x), ceildiv(num_rows, block_size.y), 1); @@ -241,7 +133,7 @@ void convert_to_dense(std::shared_ptr exec, num_rows, num_cols, result->get_stride(), as_cuda_type(result->get_values())); - constexpr auto threads_per_row = cuda_config::warp_size; + constexpr auto threads_per_row = config::warp_size; const auto grid_dim = ceildiv(slice_size * slice_num * threads_per_row, default_block_size); @@ -252,85 +144,14 @@ void convert_to_dense(std::shared_ptr exec, as_cuda_type(result->get_values())); } - GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SELLP_CONVERT_TO_DENSE_KERNEL); -namespace kernel { - - -template -__global__ __launch_bounds__(default_block_size) void count_nnz_per_row( - size_type num_rows, size_type slice_size, - const size_type *__restrict__ slice_sets, - const ValueType *__restrict__ values, IndexType *__restrict__ result) -{ - constexpr auto warp_size = cuda_config::warp_size; - const auto tidx = threadIdx.x + blockIdx.x * 
blockDim.x; - const auto row_idx = tidx / warp_size; - const auto slice_id = row_idx / slice_size; - const auto tid_in_warp = tidx % warp_size; - const auto row_in_slice = row_idx % slice_size; - - if (row_idx < num_rows) { - IndexType part_result{}; - for (size_type sellp_ind = - (slice_sets[slice_id] + tid_in_warp) * slice_size + - row_in_slice; - sellp_ind < slice_sets[slice_id + 1] * slice_size; - sellp_ind += warp_size * slice_size) { - if (values[sellp_ind] != zero()) { - part_result += 1; - } - } - - auto warp_tile = - group::tiled_partition(group::this_thread_block()); - result[row_idx] = reduce( - warp_tile, part_result, - [](const size_type &a, const size_type &b) { return a + b; }); - } -} - - -template -__global__ __launch_bounds__(default_block_size) void fill_in_csr( - size_type num_rows, size_type slice_size, - const size_type *__restrict__ source_slice_sets, - const IndexType *__restrict__ source_col_idxs, - const ValueType *__restrict__ source_values, - IndexType *__restrict__ result_row_ptrs, - IndexType *__restrict__ result_col_idxs, - ValueType *__restrict__ result_values) -{ - const auto row = threadIdx.x + blockIdx.x * blockDim.x; - const auto slice_id = row / slice_size; - const auto row_in_slice = row % slice_size; - - if (row < num_rows) { - size_type csr_ind = result_row_ptrs[row]; - for (size_type sellp_ind = - source_slice_sets[slice_id] * slice_size + row_in_slice; - sellp_ind < source_slice_sets[slice_id + 1] * slice_size; - sellp_ind += slice_size) { - if (source_values[sellp_ind] != zero()) { - result_values[csr_ind] = source_values[sellp_ind]; - result_col_idxs[csr_ind] = source_col_idxs[sellp_ind]; - csr_ind++; - } - } - } -} - - -} // namespace kernel - - template void convert_to_csr(std::shared_ptr exec, - matrix::Csr *result, - const matrix::Sellp *source) + const matrix::Sellp *source, + matrix::Csr *result) { const auto num_rows = source->get_size()[0]; const auto slice_size = source->get_slice_size(); @@ -345,8 +166,7 @@ void 
convert_to_csr(std::shared_ptr exec, auto result_col_idxs = result->get_col_idxs(); auto result_row_ptrs = result->get_row_ptrs(); - auto grid_dim = - ceildiv(num_rows * cuda_config::warp_size, default_block_size); + auto grid_dim = ceildiv(num_rows * config::warp_size, default_block_size); kernel::count_nnz_per_row<<>>( num_rows, slice_size, as_cuda_type(source_slice_sets), @@ -355,13 +175,7 @@ void convert_to_csr(std::shared_ptr exec, grid_dim = ceildiv(num_rows + 1, default_block_size); auto add_values = Array(exec, grid_dim); - start_prefix_sum<<>>( - num_rows + 1, as_cuda_type(result_row_ptrs), - as_cuda_type(add_values.get_data())); - - finalize_prefix_sum<<>>( - num_rows + 1, as_cuda_type(result_row_ptrs), - as_cuda_type(add_values.get_const_data())); + components::prefix_sum(exec, result_row_ptrs, num_rows + 1); grid_dim = ceildiv(num_rows, default_block_size); @@ -370,8 +184,6 @@ void convert_to_csr(std::shared_ptr exec, as_cuda_type(source_col_idxs), as_cuda_type(source_values), as_cuda_type(result_row_ptrs), as_cuda_type(result_col_idxs), as_cuda_type(result_values)); - - add_values.clear(); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -390,15 +202,13 @@ void count_nonzeros(std::shared_ptr exec, auto nnz_per_row = Array(exec, num_rows); - auto grid_dim = - ceildiv(num_rows * cuda_config::warp_size, default_block_size); + auto grid_dim = ceildiv(num_rows * config::warp_size, default_block_size); kernel::count_nnz_per_row<<>>( num_rows, slice_size, as_cuda_type(slice_sets), as_cuda_type(values), as_cuda_type(nnz_per_row.get_data())); *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); - nnz_per_row.clear(); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/cuda/matrix/sparsity_csr_kernels.cu b/cuda/matrix/sparsity_csr_kernels.cu index 2bdb8372030..69d2e53fe37 100644 --- a/cuda/matrix/sparsity_csr_kernels.cu +++ b/cuda/matrix/sparsity_csr_kernels.cu @@ -1,5 +1,5 @@ 
/************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,26 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/sparsity_csr_kernels.hpp" -#include - - #include -#include -#include - - -#include "core/matrix/dense_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/cusparse_bindings.hpp" -#include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/prefix_sum.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/segment_scan.cuh" -#include "cuda/components/uninitialized_array.hpp" -#include "cuda/components/zero_array.hpp" namespace gko { @@ -99,10 +80,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void remove_diagonal_elements(std::shared_ptr exec, - matrix::SparsityCsr *matrix, - const IndexType *row_ptrs, - const IndexType *col_idxs) GKO_NOT_IMPLEMENTED; +void remove_diagonal_elements( + std::shared_ptr exec, const IndexType *row_ptrs, + const IndexType *col_idxs, + matrix::SparsityCsr *matrix) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_REMOVE_DIAGONAL_ELEMENTS_KERNEL); @@ -110,11 +91,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void transpose(std::shared_ptr exec, - matrix::SparsityCsr *trans, - const matrix::SparsityCsr *orig) + const matrix::SparsityCsr *orig, + matrix::SparsityCsr *trans) GKO_NOT_IMPLEMENTED; - GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL); diff --git a/cuda/preconditioner/isai_kernels.cu b/cuda/preconditioner/isai_kernels.cu new file mode 100644 index 00000000000..858c82584d5 --- /dev/null +++ 
b/cuda/preconditioner/isai_kernels.cu @@ -0,0 +1,160 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/preconditioner/isai_kernels.hpp" + + +#include +#include +#include + + +#include "core/components/prefix_sum.hpp" +#include "core/matrix/csr_builder.hpp" +#include "cuda/base/config.hpp" +#include "cuda/base/math.hpp" +#include "cuda/base/types.hpp" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/merging.cuh" +#include "cuda/components/reduction.cuh" +#include "cuda/components/thread_ids.cuh" +#include "cuda/components/uninitialized_array.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The Isai preconditioner namespace. + * @ref Isai + * @ingroup isai + */ +namespace isai { + + +constexpr int subwarp_size{row_size_limit}; +constexpr int subwarps_per_block{2}; +constexpr int default_block_size{subwarps_per_block * subwarp_size}; + + +#include "common/preconditioner/isai_kernels.hpp.inc" + + +template +void generate_tri_inverse(std::shared_ptr exec, + const matrix::Csr *input, + matrix::Csr *inverse, + IndexType *excess_rhs_ptrs, IndexType *excess_nz_ptrs, + bool lower) +{ + const auto num_rows = input->get_size()[0]; + + const dim3 block(default_block_size, 1, 1); + const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1); + if (lower) { + kernel::generate_l_inverse + <<>>(static_cast(num_rows), + input->get_const_row_ptrs(), + input->get_const_col_idxs(), + as_cuda_type(input->get_const_values()), + inverse->get_row_ptrs(), inverse->get_col_idxs(), + as_cuda_type(inverse->get_values()), + excess_rhs_ptrs, excess_nz_ptrs); + } else { + kernel::generate_u_inverse + <<>>(static_cast(num_rows), + input->get_const_row_ptrs(), + input->get_const_col_idxs(), + as_cuda_type(input->get_const_values()), + inverse->get_row_ptrs(), inverse->get_col_idxs(), + as_cuda_type(inverse->get_values()), + excess_rhs_ptrs, excess_nz_ptrs); + } + components::prefix_sum(exec, excess_rhs_ptrs, num_rows + 1); + components::prefix_sum(exec, 
excess_nz_ptrs, num_rows + 1); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL); + + +template +void generate_excess_system(std::shared_ptr exec, + const matrix::Csr *input, + const matrix::Csr *inverse, + const IndexType *excess_rhs_ptrs, + const IndexType *excess_nz_ptrs, + matrix::Csr *excess_system, + matrix::Dense *excess_rhs) +{ + const auto num_rows = input->get_size()[0]; + + const dim3 block(default_block_size, 1, 1); + const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1); + kernel::generate_excess_system<<>>( + static_cast(num_rows), input->get_const_row_ptrs(), + input->get_const_col_idxs(), as_cuda_type(input->get_const_values()), + inverse->get_const_row_ptrs(), inverse->get_const_col_idxs(), + excess_rhs_ptrs, excess_nz_ptrs, excess_system->get_row_ptrs(), + excess_system->get_col_idxs(), + as_cuda_type(excess_system->get_values()), + as_cuda_type(excess_rhs->get_values())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL); + + +template +void scatter_excess_solution(std::shared_ptr exec, + const IndexType *excess_rhs_ptrs, + const matrix::Dense *excess_solution, + matrix::Csr *inverse) +{ + const auto num_rows = inverse->get_size()[0]; + + const dim3 block(default_block_size, 1, 1); + const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1); + kernel::copy_excess_solution<<>>( + static_cast(num_rows), inverse->get_const_row_ptrs(), + excess_rhs_ptrs, as_cuda_type(excess_solution->get_const_values()), + as_cuda_type(inverse->get_values())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL); + + +} // namespace isai +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernel.cu b/cuda/preconditioner/jacobi_advanced_apply_kernel.cu index 6cc3005f3d9..2dc9aeaf23b 100644 --- 
a/cuda/preconditioner/jacobi_advanced_apply_kernel.cu +++ b/cuda/preconditioner/jacobi_advanced_apply_kernel.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" +#include "cuda/base/config.hpp" #include "cuda/base/math.hpp" #include "cuda/base/types.hpp" #include "cuda/components/cooperative_groups.cuh" @@ -57,84 +58,9 @@ namespace cuda { * @ingroup jacobi */ namespace jacobi { -namespace kernel { - - -template -__global__ void __launch_bounds__(warps_per_block *cuda_config::warp_size) - advanced_apply(const ValueType *__restrict__ blocks, - preconditioner::block_interleaved_storage_scheme - storage_scheme, - const IndexType *__restrict__ block_ptrs, - size_type num_blocks, const ValueType *__restrict__ alpha, - const ValueType *__restrict__ b, int32 b_stride, - ValueType *__restrict__ x, int32 x_stride) -{ - const auto block_id = - thread::get_subwarp_id(); - const auto subwarp = - group::tiled_partition(group::this_thread_block()); - if (block_id >= num_blocks) { - return; - } - const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; - ValueType v = zero(); - if (subwarp.thread_rank() < block_size) { - v = alpha[0] * - b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride]; - } - multiply_vec( - subwarp, block_size, v, - blocks + storage_scheme.get_global_block_offset(block_id) + - subwarp.thread_rank(), - storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride, - x_stride, - [](ValueType &result, const ValueType &out) { result += out; }); -} - - -template -__global__ void 
__launch_bounds__(warps_per_block *cuda_config::warp_size) - advanced_adaptive_apply( - const ValueType *__restrict__ blocks, - preconditioner::block_interleaved_storage_scheme - storage_scheme, - const precision_reduction *__restrict__ block_precisions, - const IndexType *__restrict__ block_ptrs, size_type num_blocks, - const ValueType *__restrict__ alpha, const ValueType *__restrict__ b, - int32 b_stride, ValueType *__restrict__ x, int32 x_stride) -{ - const auto block_id = - thread::get_subwarp_id(); - const auto subwarp = - group::tiled_partition(group::this_thread_block()); - if (block_id >= num_blocks) { - return; - } - const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; - auto alpha_val = alpha == nullptr ? one() : alpha[0]; - ValueType v = zero(); - if (subwarp.thread_rank() < block_size) { - v = alpha[0] * - b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride]; - } - GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( - ValueType, block_precisions[block_id], - multiply_vec( - subwarp, block_size, v, - reinterpret_cast( - blocks + storage_scheme.get_group_offset(block_id)) + - storage_scheme.get_block_offset(block_id) + - subwarp.thread_rank(), - storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride, - x_stride, - [](ValueType &result, const ValueType &out) { result += out; })); -} -} // namespace kernel +#include "common/preconditioner/jacobi_advanced_apply_kernel.hpp.inc" namespace { @@ -152,7 +78,7 @@ void advanced_apply( ValueType *x, size_type x_stride) { constexpr int subwarp_size = get_larger_power(max_block_size); - constexpr int blocks_per_warp = cuda_config::warp_size / subwarp_size; + constexpr int blocks_per_warp = config::warp_size / subwarp_size; const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), 1, 1); const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); @@ -199,7 +125,7 @@ void apply(std::shared_ptr exec, size_type num_blocks, [&](int compiled_block_size) { return 
max_block_size <= compiled_block_size; }, - syn::value_list(), + syn::value_list(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, alpha->get_const_values(), diff --git a/cuda/preconditioner/jacobi_common.hpp b/cuda/preconditioner/jacobi_common.hpp index d224e4bdc15..3c76bb78388 100644 --- a/cuda/preconditioner/jacobi_common.hpp +++ b/cuda/preconditioner/jacobi_common.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,11 +30,12 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ - #include #include +#include "cuda/base/config.hpp" + namespace gko { namespace kernels { namespace cuda { @@ -46,9 +47,10 @@ namespace jacobi { * kernels should be compiled. */ #ifdef GINKGO_JACOBI_FULL_OPTIMIZATIONS -using compiled_kernels = syn::as_list>; +using compiled_kernels = syn::as_list>; #else -using compiled_kernels = syn::value_list; +using compiled_kernels = + syn::value_list; #endif diff --git a/cuda/preconditioner/jacobi_generate_kernel.cu b/cuda/preconditioner/jacobi_generate_kernel.cu index a933a7b8398..0f1c52e9621 100644 --- a/cuda/preconditioner/jacobi_generate_kernel.cu +++ b/cuda/preconditioner/jacobi_generate_kernel.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -38,8 +38,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/base/extended_float.hpp" +#include "core/components/fill_array.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" +#include "cuda/base/config.hpp" #include "cuda/base/math.hpp" #include "cuda/base/types.hpp" #include "cuda/components/cooperative_groups.cuh" @@ -47,7 +49,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" #include "cuda/components/warp_blas.cuh" -#include "cuda/components/zero_array.hpp" #include "cuda/preconditioner/jacobi_common.hpp" @@ -60,185 +61,9 @@ namespace cuda { * @ingroup jacobi */ namespace jacobi { -namespace kernel { -template -__device__ __forceinline__ bool validate_precision_reduction_feasibility( - Group &__restrict__ group, IndexType block_size, - ValueType *__restrict__ row, ValueType *__restrict__ work, size_type stride) -{ - using gko::detail::float_traits; - // save original data and reduce precision - if (group.thread_rank() < block_size) { -#pragma unroll - for (auto i = 0u; i < max_block_size; ++i) { - if (i >= block_size) { - break; - } - work[i * stride + group.thread_rank()] = row[i]; - row[i] = static_cast(static_cast(row[i])); - } - } - - // compute the condition number - auto perm = group.thread_rank(); - auto trans_perm = perm; - auto block_cond = compute_infinity_norm(group, block_size, - block_size, row); - auto succeeded = - invert_block(group, block_size, row, perm, trans_perm); - block_cond *= compute_infinity_norm(group, block_size, - block_size, row); - - // restore original data - if (group.thread_rank() < block_size) { -#pragma unroll - for (auto i = 0u; i < max_block_size; ++i) { - if (i >= block_size) { - break; - } - row[i] = work[i * stride + group.thread_rank()]; - } - } - - return succeeded && block_cond >= 1.0 && - block_cond * float_traits>::eps < 1e-3; -} - - -template -__global__ void 
__launch_bounds__(warps_per_block *cuda_config::warp_size) - generate(size_type num_rows, const IndexType *__restrict__ row_ptrs, - const IndexType *__restrict__ col_idxs, - const ValueType *__restrict__ values, - ValueType *__restrict__ block_data, - preconditioner::block_interleaved_storage_scheme - storage_scheme, - const IndexType *__restrict__ block_ptrs, size_type num_blocks) -{ - const auto block_id = - thread::get_subwarp_id(); - const auto block = group::this_thread_block(); - ValueType row[max_block_size]; - __shared__ UninitializedArray - workspace; - csr::extract_transposed_diag_blocks( - block, cuda_config::warp_size / subwarp_size, row_ptrs, col_idxs, - values, block_ptrs, num_blocks, row, 1, - workspace + threadIdx.z * max_block_size); - const auto subwarp = group::tiled_partition(block); - if (block_id < num_blocks) { - const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; - auto perm = subwarp.thread_rank(); - auto trans_perm = subwarp.thread_rank(); - invert_block(subwarp, block_size, row, perm, - trans_perm); - copy_matrix( - subwarp, block_size, row, 1, perm, trans_perm, - block_data + storage_scheme.get_global_block_offset(block_id), - storage_scheme.get_stride()); - } -} - - -template -__global__ void -__launch_bounds__(warps_per_block *cuda_config::warp_size) adaptive_generate( - size_type num_rows, const IndexType *__restrict__ row_ptrs, - const IndexType *__restrict__ col_idxs, - const ValueType *__restrict__ values, remove_complex accuracy, - ValueType *__restrict__ block_data, - preconditioner::block_interleaved_storage_scheme storage_scheme, - remove_complex *__restrict__ conditioning, - precision_reduction *__restrict__ block_precisions, - const IndexType *__restrict__ block_ptrs, size_type num_blocks) -{ - // extract blocks - const auto block_id = - thread::get_subwarp_id(); - const auto block = group::this_thread_block(); - ValueType row[max_block_size]; - __shared__ UninitializedArray - workspace; - 
csr::extract_transposed_diag_blocks( - block, cuda_config::warp_size / subwarp_size, row_ptrs, col_idxs, - values, block_ptrs, num_blocks, row, 1, - workspace + threadIdx.z * max_block_size); - - // compute inverse and figure out the correct precision - const auto subwarp = group::tiled_partition(block); - const auto block_size = - block_id < num_blocks ? block_ptrs[block_id + 1] - block_ptrs[block_id] - : 0; - auto perm = subwarp.thread_rank(); - auto trans_perm = subwarp.thread_rank(); - auto prec_descriptor = ~uint32{}; - if (block_id < num_blocks) { - auto block_cond = compute_infinity_norm( - subwarp, block_size, block_size, row); - invert_block(subwarp, block_size, row, perm, - trans_perm); - block_cond *= compute_infinity_norm(subwarp, block_size, - block_size, row); - conditioning[block_id] = block_cond; - const auto prec = block_precisions[block_id]; - prec_descriptor = - preconditioner::detail::precision_reduction_descriptor::singleton( - prec); - if (prec == precision_reduction::autodetect()) { - using preconditioner::detail::get_supported_storage_reductions; - prec_descriptor = get_supported_storage_reductions( - accuracy, block_cond, - [&subwarp, &block_size, &row, &block_data, &storage_scheme, - &block_id] { - using target = reduce_precision; - return validate_precision_reduction_feasibility< - max_block_size, target>( - subwarp, block_size, row, - block_data + - storage_scheme.get_global_block_offset(block_id), - storage_scheme.get_stride()); - }, - [&subwarp, &block_size, &row, &block_data, &storage_scheme, - &block_id] { - using target = - reduce_precision>; - return validate_precision_reduction_feasibility< - max_block_size, target>( - subwarp, block_size, row, - block_data + - storage_scheme.get_global_block_offset(block_id), - storage_scheme.get_stride()); - }); - } - } - - // make sure all blocks in the group have the same precision - const auto warp = group::tiled_partition(block); - const auto prec = - 
preconditioner::detail::get_optimal_storage_reduction(reduce( - warp, prec_descriptor, [](uint32 x, uint32 y) { return x & y; })); - - // store the block back into memory - if (block_id < num_blocks) { - block_precisions[block_id] = prec; - GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( - ValueType, prec, - copy_matrix( - subwarp, block_size, row, 1, perm, trans_perm, - reinterpret_cast( - block_data + storage_scheme.get_group_offset(block_id)) + - storage_scheme.get_block_offset(block_id), - storage_scheme.get_stride())); - } -} - - -} // namespace kernel +#include "common/preconditioner/jacobi_generate_kernel.hpp.inc" namespace { @@ -256,7 +81,7 @@ void generate(syn::value_list, const IndexType *block_ptrs, size_type num_blocks) { constexpr int subwarp_size = get_larger_power(max_block_size); - constexpr int blocks_per_warp = cuda_config::warp_size / subwarp_size; + constexpr int blocks_per_warp = config::warp_size / subwarp_size; const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), 1, 1); const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); @@ -297,16 +122,17 @@ void generate(std::shared_ptr exec, Array &block_precisions, const Array &block_pointers, Array &blocks) { - zero_array(blocks.get_num_elems(), blocks.get_data()); - select_generate(compiled_kernels(), - [&](int compiled_block_size) { - return max_block_size <= compiled_block_size; - }, - syn::value_list(), - syn::type_list<>(), system_matrix, accuracy, - blocks.get_data(), storage_scheme, conditioning.get_data(), - block_precisions.get_data(), - block_pointers.get_const_data(), num_blocks); + components::fill_array(exec, blocks.get_data(), blocks.get_num_elems(), + zero()); + select_generate( + compiled_kernels(), + [&](int compiled_block_size) { + return max_block_size <= compiled_block_size; + }, + syn::value_list(), syn::type_list<>(), + system_matrix, accuracy, blocks.get_data(), storage_scheme, + conditioning.get_data(), block_precisions.get_data(), + 
block_pointers.get_const_data(), num_blocks); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu index 4f3bcb17f97..e0662499762 100644 --- a/cuda/preconditioner/jacobi_kernels.cu +++ b/cuda/preconditioner/jacobi_kernels.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -37,9 +37,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/extended_float.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "cuda/base/config.hpp" #include "cuda/base/math.hpp" #include "cuda/base/types.hpp" #include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/thread_ids.cuh" +#include "cuda/preconditioner/jacobi_common.hpp" namespace gko { @@ -55,98 +60,13 @@ namespace { // a total of 32 warps (1024 threads) -constexpr int default_block_size = 32; +constexpr int default_num_warps = 32; // with current architectures, at most 32 warps can be scheduled per SM (and // current GPUs have at most 84 SMs) constexpr int default_grid_size = 32 * 32 * 128; -template -__global__ -__launch_bounds__(warps_per_block *cuda_config::warp_size) void duplicate_array( - const precision_reduction *__restrict__ source, size_type source_size, - precision_reduction *__restrict__ dest, size_type dest_size) -{ - auto grid = group::this_grid(); - if (grid.thread_rank() >= dest_size) { - return; - } - for (auto i = grid.thread_rank(); i < dest_size; i += grid.size()) { - dest[i] = source[i % source_size]; - } -} - - -template -__global__ void compare_adjacent_rows(size_type num_rows, int32 max_block_size, - const IndexType *__restrict__ row_ptrs, - const IndexType *__restrict__ col_idx, - 
bool *__restrict__ matching_next_row) -{ - const auto global_tid = blockDim.x * blockIdx.x + threadIdx.x; - const auto local_tid = threadIdx.x % cuda_config::warp_size; - const auto warp_id = global_tid / cuda_config::warp_size; - const auto warp = group::tiled_partition( - group::this_thread_block()); - - if (warp_id >= num_rows - 1) { - return; - } - - const auto curr_row_start = row_ptrs[warp_id]; - const auto next_row_start = row_ptrs[warp_id + 1]; - const auto next_row_end = row_ptrs[warp_id + 2]; - - const auto nz_this_row = next_row_end - next_row_start; - const auto nz_prev_row = next_row_start - curr_row_start; - - if (nz_this_row != nz_prev_row) { - matching_next_row[warp_id] = false; - return; - } - size_type steps = ceildiv(nz_this_row, cuda_config::warp_size); - for (size_type i = 0; i < steps; i++) { - auto j = local_tid + i * cuda_config::warp_size; - auto prev_col = (curr_row_start + j < next_row_start) - ? col_idx[curr_row_start + j] - : 0; - auto this_col = (curr_row_start + j < next_row_start) - ? 
col_idx[next_row_start + j] - : 0; - if (warp.any(prev_col != this_col)) { - matching_next_row[warp_id] = false; - return; - } - } - matching_next_row[warp_id] = true; -} - - -template -__global__ void generate_natural_block_pointer( - size_type num_rows, int32 max_block_size, - const bool *__restrict__ matching_next_row, - IndexType *__restrict__ block_ptrs, size_type *__restrict__ num_blocks_arr) -{ - block_ptrs[0] = 0; - if (num_rows == 0) { - return; - } - size_type num_blocks = 1; - int32 current_block_size = 1; - for (size_type i = 0; i < num_rows - 1; ++i) { - if ((matching_next_row[i]) && (current_block_size < max_block_size)) { - ++current_block_size; - } else { - block_ptrs[num_blocks] = - block_ptrs[num_blocks - 1] + current_block_size; - ++num_blocks; - current_block_size = 1; - } - } - block_ptrs[num_blocks] = block_ptrs[num_blocks - 1] + current_block_size; - num_blocks_arr[0] = num_blocks; -} +#include "common/preconditioner/jacobi_kernels.hpp.inc" template @@ -159,10 +79,9 @@ size_type find_natural_blocks(std::shared_ptr exec, Array matching_next_row(exec, mtx->get_size()[0] - 1); - const dim3 block_size(default_block_size, 1, 1); + const dim3 block_size(config::warp_size, 1, 1); const dim3 grid_size( - ceildiv(mtx->get_size()[0] * cuda_config::warp_size, block_size.x), 1, - 1); + ceildiv(mtx->get_size()[0] * config::warp_size, block_size.x), 1, 1); compare_adjacent_rows<<>>( mtx->get_size()[0], max_block_size, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), matching_next_row.get_data()); @@ -174,32 +93,6 @@ size_type find_natural_blocks(std::shared_ptr exec, } -template -__global__ void agglomerate_supervariables_kernel( - int32 max_block_size, size_type num_natural_blocks, - IndexType *__restrict__ block_ptrs, size_type *__restrict__ num_blocks_arr) -{ - num_blocks_arr[0] = 0; - if (num_natural_blocks == 0) { - return; - } - size_type num_blocks = 1; - int32 current_block_size = block_ptrs[1] - block_ptrs[0]; - for (size_type i = 1; i < 
num_natural_blocks; ++i) { - const int32 block_size = block_ptrs[i + 1] - block_ptrs[i]; - if (current_block_size + block_size <= max_block_size) { - current_block_size += block_size; - } else { - block_ptrs[num_blocks] = block_ptrs[i]; - ++num_blocks; - current_block_size = block_size; - } - } - block_ptrs[num_blocks] = block_ptrs[num_natural_blocks]; - num_blocks_arr[0] = num_blocks; -} - - template inline size_type agglomerate_supervariables( std::shared_ptr exec, int32 max_block_size, @@ -222,11 +115,11 @@ void initialize_precisions(std::shared_ptr exec, const Array &source, Array &precisions) { - const auto block_size = default_block_size * cuda_config::warp_size; + const auto block_size = default_num_warps * config::warp_size; const auto grid_size = min( default_grid_size, static_cast(ceildiv(precisions.get_num_elems(), block_size))); - duplicate_array<<>>( + duplicate_array<<>>( source.get_const_data(), source.get_num_elems(), precisions.get_data(), precisions.get_num_elems()); } @@ -248,6 +141,93 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL); +namespace { + + +template +void transpose_jacobi( + syn::value_list, size_type num_blocks, + const precision_reduction *block_precisions, + const IndexType *block_pointers, const ValueType *blocks, + const preconditioner::block_interleaved_storage_scheme + &storage_scheme, + ValueType *out_blocks) +{ + constexpr int subwarp_size = get_larger_power(max_block_size); + constexpr int blocks_per_warp = config::warp_size / subwarp_size; + const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), + 1, 1); + const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); + + if (block_precisions) { + adaptive_transpose_jacobi + <<>>( + as_cuda_type(blocks), storage_scheme, block_precisions, + block_pointers, num_blocks, as_cuda_type(out_blocks)); + } else { + transpose_jacobi<<>>( + as_cuda_type(blocks), storage_scheme, block_pointers, num_blocks, + 
as_cuda_type(out_blocks)); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_jacobi, transpose_jacobi); + + +} // namespace + + +template +void transpose_jacobi( + std::shared_ptr exec, size_type num_blocks, + uint32 max_block_size, const Array &block_precisions, + const Array &block_pointers, const Array &blocks, + const preconditioner::block_interleaved_storage_scheme + &storage_scheme, + Array &out_blocks) +{ + select_transpose_jacobi( + compiled_kernels(), + [&](int compiled_block_size) { + return max_block_size <= compiled_block_size; + }, + syn::value_list(), + syn::type_list<>(), num_blocks, block_precisions.get_const_data(), + block_pointers.get_const_data(), blocks.get_const_data(), + storage_scheme, out_blocks.get_data()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL); + + +template +void conj_transpose_jacobi( + std::shared_ptr exec, size_type num_blocks, + uint32 max_block_size, const Array &block_precisions, + const Array &block_pointers, const Array &blocks, + const preconditioner::block_interleaved_storage_scheme + &storage_scheme, + Array &out_blocks) +{ + select_transpose_jacobi( + compiled_kernels(), + [&](int compiled_block_size) { + return max_block_size <= compiled_block_size; + }, + syn::value_list(), + syn::type_list<>(), num_blocks, block_precisions.get_const_data(), + block_pointers.get_const_data(), blocks.get_const_data(), + storage_scheme, out_blocks.get_data()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL); + + template void convert_to_dense( std::shared_ptr exec, size_type num_blocks, diff --git a/cuda/preconditioner/jacobi_simple_apply_kernel.cu b/cuda/preconditioner/jacobi_simple_apply_kernel.cu index a5dfd71fda7..fb6721bbdca 100644 --- a/cuda/preconditioner/jacobi_simple_apply_kernel.cu +++ b/cuda/preconditioner/jacobi_simple_apply_kernel.cu @@ -1,5 +1,5 @@ /************************************************************* 
-Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" +#include "cuda/base/config.hpp" #include "cuda/base/math.hpp" #include "cuda/base/types.hpp" #include "cuda/components/cooperative_groups.cuh" @@ -57,79 +58,9 @@ namespace cuda { * @ingroup jacobi */ namespace jacobi { -namespace kernel { - - -template -__global__ void __launch_bounds__(warps_per_block *cuda_config::warp_size) - apply(const ValueType *__restrict__ blocks, - preconditioner::block_interleaved_storage_scheme - storage_scheme, - const IndexType *__restrict__ block_ptrs, size_type num_blocks, - const ValueType *__restrict__ b, int32 b_stride, - ValueType *__restrict__ x, int32 x_stride) -{ - const auto block_id = - thread::get_subwarp_id(); - const auto subwarp = - group::tiled_partition(group::this_thread_block()); - if (block_id >= num_blocks) { - return; - } - const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; - ValueType v = zero(); - if (subwarp.thread_rank() < block_size) { - v = b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride]; - } - multiply_vec( - subwarp, block_size, v, - blocks + storage_scheme.get_global_block_offset(block_id) + - subwarp.thread_rank(), - storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride, - x_stride, - [](ValueType &result, const ValueType &out) { result = out; }); -} - - -template -__global__ void __launch_bounds__(warps_per_block *cuda_config::warp_size) - adaptive_apply(const ValueType *__restrict__ blocks, - preconditioner::block_interleaved_storage_scheme - storage_scheme, - const precision_reduction *__restrict__ block_precisions, - const IndexType *__restrict__ 
block_ptrs, - size_type num_blocks, const ValueType *__restrict__ b, - int32 b_stride, ValueType *__restrict__ x, int32 x_stride) -{ - const auto block_id = - thread::get_subwarp_id(); - const auto subwarp = - group::tiled_partition(group::this_thread_block()); - if (block_id >= num_blocks) { - return; - } - const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; - ValueType v = zero(); - if (subwarp.thread_rank() < block_size) { - v = b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride]; - } - GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( - ValueType, block_precisions[block_id], - multiply_vec( - subwarp, block_size, v, - reinterpret_cast( - blocks + storage_scheme.get_group_offset(block_id)) + - storage_scheme.get_block_offset(block_id) + - subwarp.thread_rank(), - storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride, - x_stride, - [](ValueType &result, const ValueType &out) { result = out; })); -} -} // namespace kernel +#include "common/preconditioner/jacobi_simple_apply_kernel.hpp.inc" namespace { @@ -146,7 +77,7 @@ void apply(syn::value_list, size_type num_blocks, size_type x_stride) { constexpr int subwarp_size = get_larger_power(max_block_size); - constexpr int blocks_per_warp = cuda_config::warp_size / subwarp_size; + constexpr int blocks_per_warp = config::warp_size / subwarp_size; const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), 1, 1); const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); @@ -184,16 +115,16 @@ void simple_apply( { // TODO: write a special kernel for multiple RHS for (size_type col = 0; col < b->get_size()[1]; ++col) { - select_apply(compiled_kernels(), - [&](int compiled_block_size) { - return max_block_size <= compiled_block_size; - }, - syn::value_list(), - syn::type_list<>(), num_blocks, - block_precisions.get_const_data(), - block_pointers.get_const_data(), blocks.get_const_data(), - storage_scheme, b->get_const_values() + col, - b->get_stride(), 
x->get_values() + col, x->get_stride()); + select_apply( + compiled_kernels(), + [&](int compiled_block_size) { + return max_block_size <= compiled_block_size; + }, + syn::value_list(), + syn::type_list<>(), num_blocks, block_precisions.get_const_data(), + block_pointers.get_const_data(), blocks.get_const_data(), + storage_scheme, b->get_const_values() + col, b->get_stride(), + x->get_values() + col, x->get_stride()); } } diff --git a/cuda/solver/bicg_kernels.cu b/cuda/solver/bicg_kernels.cu new file mode 100644 index 00000000000..175198d26d9 --- /dev/null +++ b/cuda/solver/bicg_kernels.cu @@ -0,0 +1,144 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/solver/bicg_kernels.hpp" + + +#include +#include + + +#include "cuda/base/math.hpp" +#include "cuda/base/types.hpp" +#include "cuda/components/thread_ids.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The BICG solver namespace. + * + * @ingroup bicg + */ +namespace bicg { + + +constexpr int default_block_size = 512; + + +#include "common/solver/bicg_kernels.hpp.inc" + + +template +void initialize(std::shared_ptr exec, + const matrix::Dense *b, matrix::Dense *r, + matrix::Dense *z, matrix::Dense *p, + matrix::Dense *q, matrix::Dense *prev_rho, + matrix::Dense *rho, matrix::Dense *r2, + matrix::Dense *z2, matrix::Dense *p2, + matrix::Dense *q2, + Array *stop_status) +{ + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(b->get_size()[0] * b->get_stride(), block_size.x), 1, 1); + + initialize_kernel<<>>( + b->get_size()[0], b->get_size()[1], b->get_stride(), + as_cuda_type(b->get_const_values()), as_cuda_type(r->get_values()), + as_cuda_type(z->get_values()), as_cuda_type(p->get_values()), + as_cuda_type(q->get_values()), as_cuda_type(r2->get_values()), + as_cuda_type(z2->get_values()), as_cuda_type(p2->get_values()), + as_cuda_type(q2->get_values()), as_cuda_type(prev_rho->get_values()), + as_cuda_type(rho->get_values()), as_cuda_type(stop_status->get_data())); +} + 
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_INITIALIZE_KERNEL); + + +template +void step_1(std::shared_ptr exec, + matrix::Dense *p, const matrix::Dense *z, + matrix::Dense *p2, const matrix::Dense *z2, + const matrix::Dense *rho, + const matrix::Dense *prev_rho, + const Array *stop_status) +{ + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(p->get_size()[0] * p->get_stride(), block_size.x), 1, 1); + + step_1_kernel<<>>( + p->get_size()[0], p->get_size()[1], p->get_stride(), + as_cuda_type(p->get_values()), as_cuda_type(z->get_const_values()), + as_cuda_type(p2->get_values()), as_cuda_type(z2->get_const_values()), + as_cuda_type(rho->get_const_values()), + as_cuda_type(prev_rho->get_const_values()), + as_cuda_type(stop_status->get_const_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_1_KERNEL); + + +template +void step_2(std::shared_ptr exec, + matrix::Dense *x, matrix::Dense *r, + matrix::Dense *r2, const matrix::Dense *p, + const matrix::Dense *q, + const matrix::Dense *q2, + const matrix::Dense *beta, + const matrix::Dense *rho, + const Array *stop_status) +{ + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(p->get_size()[0] * p->get_stride(), block_size.x), 1, 1); + + step_2_kernel<<>>( + p->get_size()[0], p->get_size()[1], p->get_stride(), x->get_stride(), + as_cuda_type(x->get_values()), as_cuda_type(r->get_values()), + as_cuda_type(r2->get_values()), as_cuda_type(p->get_const_values()), + as_cuda_type(q->get_const_values()), + as_cuda_type(q2->get_const_values()), + as_cuda_type(beta->get_const_values()), + as_cuda_type(rho->get_const_values()), + as_cuda_type(stop_status->get_const_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_2_KERNEL); + + +} // namespace bicg +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/solver/bicgstab_kernels.cu b/cuda/solver/bicgstab_kernels.cu index 
1b3c4824019..a0e5376cf69 100644 --- a/cuda/solver/bicgstab_kernels.cu +++ b/cuda/solver/bicgstab_kernels.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/math.hpp" #include "cuda/base/types.hpp" +#include "cuda/components/thread_ids.cuh" namespace gko { @@ -55,42 +56,7 @@ namespace bicgstab { constexpr int default_block_size = 512; -template -__global__ __launch_bounds__(default_block_size) void initialize_kernel( - size_type num_rows, size_type num_cols, size_type stride, - const ValueType *__restrict__ b, ValueType *__restrict__ r, - ValueType *__restrict__ rr, ValueType *__restrict__ y, - ValueType *__restrict__ s, ValueType *__restrict__ t, - ValueType *__restrict__ z, ValueType *__restrict__ v, - ValueType *__restrict__ p, ValueType *__restrict__ prev_rho, - ValueType *__restrict__ rho, ValueType *__restrict__ alpha, - ValueType *__restrict__ beta, ValueType *__restrict__ gamma, - ValueType *__restrict__ omega, stopping_status *__restrict__ stop_status) -{ - const auto tidx = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; - - if (tidx < num_cols) { - prev_rho[tidx] = one(); - rho[tidx] = one(); - alpha[tidx] = one(); - beta[tidx] = one(); - gamma[tidx] = one(); - omega[tidx] = one(); - stop_status[tidx].reset(); - } - - if (tidx < num_rows * stride) { - r[tidx] = b[tidx]; - rr[tidx] = zero(); - y[tidx] = zero(); - s[tidx] = zero(); - t[tidx] = zero(); - z[tidx] = zero(); - v[tidx] = zero(); - p[tidx] = zero(); - } -} +#include "common/solver/bicgstab_kernels.hpp.inc" template @@ -125,31 +91,6 @@ void initialize(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL); -template -__global__ 
__launch_bounds__(default_block_size) void step_1_kernel( - size_type num_rows, size_type num_cols, size_type stride, - const ValueType *__restrict__ r, ValueType *__restrict__ p, - const ValueType *__restrict__ v, const ValueType *__restrict__ rho, - const ValueType *__restrict__ prev_rho, const ValueType *__restrict__ alpha, - const ValueType *__restrict__ omega, - const stopping_status *__restrict__ stop_status) -{ - const auto tidx = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; - const auto col = tidx % stride; - if (col >= num_cols || tidx >= num_rows * stride || - stop_status[col].has_stopped()) { - return; - } - auto res = r[tidx]; - if (prev_rho[col] * omega[col] != zero()) { - const auto tmp = (rho[col] / prev_rho[col]) * (alpha[col] / omega[col]); - res += tmp * (p[tidx] - omega[col] * v[tidx]); - } - p[tidx] = res; -} - - template void step_1(std::shared_ptr exec, const matrix::Dense *r, matrix::Dense *p, @@ -178,32 +119,6 @@ void step_1(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_1_KERNEL); -template -__global__ __launch_bounds__(default_block_size) void step_2_kernel( - size_type num_rows, size_type num_cols, size_type stride, - const ValueType *__restrict__ r, ValueType *__restrict__ s, - const ValueType *__restrict__ v, const ValueType *__restrict__ rho, - ValueType *__restrict__ alpha, const ValueType *__restrict__ beta, - const stopping_status *__restrict__ stop_status) -{ - const size_type tidx = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; - const size_type col = tidx % stride; - if (col >= num_cols || tidx >= num_rows * stride || - stop_status[col].has_stopped()) { - return; - } - auto t_alpha = zero(); - auto t_s = r[tidx]; - if (beta[col] != zero()) { - t_alpha = rho[col] / beta[col]; - t_s -= t_alpha * v[tidx]; - } - alpha[col] = t_alpha; - s[tidx] = t_s; -} - - template void step_2(std::shared_ptr exec, const matrix::Dense *r, matrix::Dense *s, @@ -230,39 +145,6 @@ void 
step_2(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_2_KERNEL); -template -__global__ __launch_bounds__(default_block_size) void step_3_kernel( - size_type num_rows, size_type num_cols, size_type stride, - size_type x_stride, ValueType *__restrict__ x, ValueType *__restrict__ r, - const ValueType *__restrict__ s, const ValueType *__restrict__ t, - const ValueType *__restrict__ y, const ValueType *__restrict__ z, - const ValueType *__restrict__ alpha, const ValueType *__restrict__ beta, - const ValueType *__restrict__ gamma, ValueType *__restrict__ omega, - const stopping_status *__restrict__ stop_status) -{ - const auto tidx = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; - const auto row = tidx / stride; - const auto col = tidx % stride; - if (col >= num_cols || tidx >= num_rows * stride || - stop_status[col].has_stopped()) { - return; - } - const auto x_pos = row * x_stride + col; - auto t_omega = zero(); - auto t_x = x[x_pos] + alpha[col] * y[tidx]; - auto t_r = s[tidx]; - if (beta[col] != zero()) { - t_omega = gamma[col] / beta[col]; - t_x += t_omega * z[tidx]; - t_r -= t_omega * t[tidx]; - } - omega[col] = t_omega; - x[x_pos] = t_x; - r[tidx] = t_r; -} - - template void step_3( std::shared_ptr exec, matrix::Dense *x, @@ -289,28 +171,8 @@ void step_3( as_cuda_type(omega->get_values()), as_cuda_type(stop_status->get_const_data())); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_3_KERNEL); - -template -__global__ __launch_bounds__(default_block_size) void finalize_kernel( - size_type num_rows, size_type num_cols, size_type stride, - size_type x_stride, ValueType *__restrict__ x, - const ValueType *__restrict__ y, const ValueType *__restrict__ alpha, - stopping_status *__restrict__ stop_status) -{ - const auto tidx = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; - const auto row = tidx / stride; - const auto col = tidx % stride; - if (col >= num_cols || tidx >= num_rows * stride || - 
stop_status[col].is_finalized() || !stop_status[col].has_stopped()) { - return; - } - const auto x_pos = row * x_stride + col; - x[x_pos] = x[x_pos] + alpha[col] * y[tidx]; - stop_status[col].finalize(); -} +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_3_KERNEL); template diff --git a/cuda/solver/cg_kernels.cu b/cuda/solver/cg_kernels.cu index 2e39762e7f2..9adb589a9ea 100644 --- a/cuda/solver/cg_kernels.cu +++ b/cuda/solver/cg_kernels.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/math.hpp" #include "cuda/base/types.hpp" +#include "cuda/components/thread_ids.cuh" namespace gko { @@ -55,30 +56,7 @@ namespace cg { constexpr int default_block_size = 512; -template -__global__ __launch_bounds__(default_block_size) void initialize_kernel( - size_type num_rows, size_type num_cols, size_type stride, - const ValueType *__restrict__ b, ValueType *__restrict__ r, - ValueType *__restrict__ z, ValueType *__restrict__ p, - ValueType *__restrict__ q, ValueType *__restrict__ prev_rho, - ValueType *__restrict__ rho, stopping_status *__restrict__ stop_status) -{ - const auto tidx = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; - - if (tidx < num_cols) { - rho[tidx] = zero(); - prev_rho[tidx] = one(); - stop_status[tidx].reset(); - } - - if (tidx < num_rows * stride) { - r[tidx] = b[tidx]; - z[tidx] = zero(); - p[tidx] = zero(); - q[tidx] = zero(); - } -} +#include "common/solver/cg_kernels.hpp.inc" template @@ -104,26 +82,6 @@ void initialize(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_INITIALIZE_KERNEL); -template -__global__ __launch_bounds__(default_block_size) void step_1_kernel( - size_type num_rows, size_type 
num_cols, size_type stride, - ValueType *__restrict__ p, const ValueType *__restrict__ z, - const ValueType *__restrict__ rho, const ValueType *__restrict__ prev_rho, - const stopping_status *__restrict__ stop_status) -{ - const auto tidx = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; - const auto col = tidx % stride; - if (col >= num_cols || tidx >= num_rows * stride || - stop_status[col].has_stopped()) { - return; - } - const auto tmp = rho[col] / prev_rho[col]; - p[tidx] = - prev_rho[col] == zero() ? z[tidx] : z[tidx] + tmp * p[tidx]; -} - - template void step_1(std::shared_ptr exec, matrix::Dense *p, const matrix::Dense *z, @@ -146,31 +104,6 @@ void step_1(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_STEP_1_KERNEL); -template -__global__ __launch_bounds__(default_block_size) void step_2_kernel( - size_type num_rows, size_type num_cols, size_type stride, - size_type x_stride, ValueType *__restrict__ x, ValueType *__restrict__ r, - const ValueType *__restrict__ p, const ValueType *__restrict__ q, - const ValueType *__restrict__ beta, const ValueType *__restrict__ rho, - const stopping_status *__restrict__ stop_status) -{ - const auto tidx = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; - const auto row = tidx / stride; - const auto col = tidx % stride; - - if (col >= num_cols || tidx >= num_rows * num_cols || - stop_status[col].has_stopped()) { - return; - } - if (beta[col] != zero()) { - const auto tmp = rho[col] / beta[col]; - x[row * x_stride + col] += tmp * p[tidx]; - r[tidx] -= tmp * q[tidx]; - } -} - - template void step_2(std::shared_ptr exec, matrix::Dense *x, matrix::Dense *r, diff --git a/cuda/solver/cgs_kernels.cu b/cuda/solver/cgs_kernels.cu index c36536d4b93..1c1b1af6b48 100644 --- a/cuda/solver/cgs_kernels.cu +++ b/cuda/solver/cgs_kernels.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo 
authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/math.hpp" #include "cuda/base/types.hpp" +#include "cuda/components/thread_ids.cuh" namespace gko { @@ -55,41 +56,7 @@ namespace cgs { constexpr int default_block_size = 512; -template -__global__ __launch_bounds__(default_block_size) void initialize_kernel( - size_type num_rows, size_type num_cols, size_type stride, - const ValueType *__restrict__ b, ValueType *__restrict__ r, - ValueType *__restrict__ r_tld, ValueType *__restrict__ p, - ValueType *__restrict__ q, ValueType *__restrict__ u, - ValueType *__restrict__ u_hat, ValueType *__restrict__ v_hat, - ValueType *__restrict__ t, ValueType *__restrict__ alpha, - ValueType *__restrict__ beta, ValueType *__restrict__ gamma, - ValueType *__restrict__ rho_prev, ValueType *__restrict__ rho, - stopping_status *__restrict__ stop_status) -{ - const auto tidx = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; - - if (tidx < num_cols) { - rho[tidx] = zero(); - alpha[tidx] = one(); - beta[tidx] = one(); - gamma[tidx] = one(); - rho_prev[tidx] = one(); - stop_status[tidx].reset(); - } - - if (tidx < num_rows * stride) { - r[tidx] = b[tidx]; - r_tld[tidx] = b[tidx]; - u[tidx] = zero(); - p[tidx] = zero(); - q[tidx] = zero(); - u_hat[tidx] = zero(); - v_hat[tidx] = zero(); - t[tidx] = zero(); - } -} +#include "common/solver/cgs_kernels.hpp.inc" template @@ -124,31 +91,6 @@ void initialize(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_INITIALIZE_KERNEL); -template -__global__ __launch_bounds__(default_block_size) void step_1_kernel( - size_type num_rows, size_type num_cols, size_type stride, - const ValueType *__restrict__ r, ValueType *__restrict__ u, - ValueType *__restrict__ p, const ValueType *__restrict__ q, - ValueType *__restrict__ beta, const ValueType *__restrict__ rho, - const 
ValueType *__restrict__ rho_prev, - const stopping_status *__restrict__ stop_status) -{ - const auto tidx = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; - const auto col = tidx % stride; - - if (col >= num_cols || tidx >= num_rows * stride || - stop_status[col].has_stopped()) { - return; - } - if (rho_prev[col] != zero()) { - beta[col] = rho[col] / rho_prev[col]; - u[tidx] = r[tidx] + beta[col] * q[tidx]; - p[tidx] = u[tidx] + beta[col] * (q[tidx] + beta[col] * p[tidx]); - } -} - - template void step_1(std::shared_ptr exec, const matrix::Dense *r, matrix::Dense *u, @@ -173,31 +115,6 @@ void step_1(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_1_KERNEL); -template -__global__ __launch_bounds__(default_block_size) void step_2_kernel( - size_type num_rows, size_type num_cols, size_type stride, - const ValueType *__restrict__ u, const ValueType *__restrict__ v_hat, - ValueType *__restrict__ q, ValueType *__restrict__ t, - ValueType *__restrict__ alpha, const ValueType *__restrict__ rho, - const ValueType *__restrict__ gamma, - const stopping_status *__restrict__ stop_status) -{ - const auto tidx = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; - const auto col = tidx % stride; - - if (col >= num_cols || tidx >= num_rows * stride || - stop_status[col].has_stopped()) { - return; - } - if (gamma[col] != zero()) { - alpha[col] = rho[col] / gamma[col]; - q[tidx] = u[tidx] - alpha[col] * v_hat[tidx]; - t[tidx] = u[tidx] + q[tidx]; - } -} - - template void step_2(std::shared_ptr exec, const matrix::Dense *u, @@ -224,30 +141,6 @@ void step_2(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_2_KERNEL); -template -__global__ __launch_bounds__(default_block_size) void step_3_kernel( - size_type num_rows, size_type num_cols, size_type stride, - size_type x_stride, const ValueType *__restrict__ t, - const ValueType *__restrict__ v_hat, ValueType *__restrict__ r, - ValueType *__restrict__ x, const 
ValueType *__restrict__ alpha, - const stopping_status *__restrict__ stop_status) -{ - const auto tidx = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; - const auto row = tidx / stride; - const auto col = tidx % stride; - if (col >= num_cols || tidx >= num_rows * stride || - stop_status[col].has_stopped()) { - return; - } - const auto x_pos = row * x_stride + col; - auto t_x = x[x_pos] + alpha[col] * v_hat[tidx]; - auto t_r = r[tidx] - alpha[col] * t[tidx]; - x[x_pos] = t_x; - r[tidx] = t_r; -} - - template void step_3(std::shared_ptr exec, const matrix::Dense *t, diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 3b1fb1f1fd2..f16be5ee0e1 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -49,13 +49,112 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/cusparse_bindings.hpp" -#include "cuda/base/device_guard.hpp" #include "cuda/base/math.hpp" #include "cuda/base/pointer_mode_guard.hpp" #include "cuda/base/types.hpp" namespace gko { +namespace solver { + + +struct SolveStruct { + virtual void dummy() {} +}; + + +namespace cuda { + + +#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020)) + + +struct SolveStruct : gko::solver::SolveStruct { + int algorithm; + csrsm2Info_t solve_info; + cusparseSolvePolicy_t policy; + cusparseMatDescr_t factor_descr; + size_t factor_work_size; + void *factor_work_vec; + SolveStruct() + { + factor_work_vec = nullptr; + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateMatDescr(&factor_descr)); + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseSetMatIndexBase(factor_descr, CUSPARSE_INDEX_BASE_ZERO)); + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseSetMatType(factor_descr, CUSPARSE_MATRIX_TYPE_GENERAL)); + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseSetMatDiagType(factor_descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateCsrsm2Info(&solve_info)); + algorithm = 0; + policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; + } + + SolveStruct(const SolveStruct &) = delete; + + SolveStruct(SolveStruct &&) = delete; + + SolveStruct &operator=(const SolveStruct &) = delete; + + SolveStruct &operator=(SolveStruct &&) = delete; + + ~SolveStruct() + { + cusparseDestroyMatDescr(factor_descr); + if (solve_info) { + cusparseDestroyCsrsm2Info(solve_info); + } + if (factor_work_vec != nullptr) { + cudaFree(factor_work_vec); + factor_work_vec = nullptr; + } + } +}; + + +#elif (defined(CUDA_VERSION) && (CUDA_VERSION < 9020)) + + +struct SolveStruct : gko::solver::SolveStruct { + cusparseSolveAnalysisInfo_t solve_info; + cusparseMatDescr_t factor_descr; + SolveStruct() + { + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseCreateSolveAnalysisInfo(&solve_info)); + 
GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateMatDescr(&factor_descr)); + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseSetMatIndexBase(factor_descr, CUSPARSE_INDEX_BASE_ZERO)); + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseSetMatType(factor_descr, CUSPARSE_MATRIX_TYPE_GENERAL)); + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseSetMatDiagType(factor_descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); + } + + SolveStruct(const SolveStruct &) = delete; + + SolveStruct(SolveStruct &&) = delete; + + SolveStruct &operator=(const SolveStruct &) = delete; + + SolveStruct &operator=(SolveStruct &&) = delete; + + ~SolveStruct() + { + cusparseDestroyMatDescr(factor_descr); + cusparseDestroySolveAnalysisInfo(solve_info); + } +}; + + +#endif + + +} // namespace cuda +} // namespace solver + + namespace kernels { namespace cuda { namespace { @@ -83,7 +182,7 @@ void should_perform_transpose_kernel(std::shared_ptr exec, void init_struct_kernel(std::shared_ptr exec, std::shared_ptr &solve_struct) { - solve_struct = std::make_shared(); + solve_struct = std::make_shared(); } @@ -94,65 +193,73 @@ void generate_kernel(std::shared_ptr exec, const gko::size_type num_rhs, bool is_upper) { if (cusparse::is_supported::value) { - auto handle = exec->get_cusparse_handle(); - if (is_upper) { - GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSetMatFillMode( - solve_struct->factor_descr, CUSPARSE_FILL_MODE_UPPER)); - } + if (auto cuda_solve_struct = + dynamic_cast(solve_struct)) { + auto handle = exec->get_cusparse_handle(); + if (is_upper) { + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSetMatFillMode( + cuda_solve_struct->factor_descr, CUSPARSE_FILL_MODE_UPPER)); + } #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020)) - ValueType one = 1.0; + ValueType one = 1.0; - { - cusparse::pointer_mode_guard pm_guard(handle); - cusparse::buffer_size_ext( - handle, solve_struct->algorithm, - CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, - matrix->get_size()[0], num_rhs, - matrix->get_num_stored_elements(), &one, - 
solve_struct->factor_descr, matrix->get_const_values(), - matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), - nullptr, num_rhs, solve_struct->solve_info, - solve_struct->policy, &solve_struct->factor_work_size); - - // allocate workspace - if (solve_struct->factor_work_vec != nullptr) { - exec->free(solve_struct->factor_work_vec); + { + cusparse::pointer_mode_guard pm_guard(handle); + cusparse::buffer_size_ext( + handle, cuda_solve_struct->algorithm, + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], + num_rhs, matrix->get_num_stored_elements(), &one, + cuda_solve_struct->factor_descr, matrix->get_const_values(), + matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), + nullptr, num_rhs, cuda_solve_struct->solve_info, + cuda_solve_struct->policy, + &cuda_solve_struct->factor_work_size); + + // allocate workspace + if (cuda_solve_struct->factor_work_vec != nullptr) { + exec->free(cuda_solve_struct->factor_work_vec); + } + cuda_solve_struct->factor_work_vec = + exec->alloc(cuda_solve_struct->factor_work_size); + + cusparse::csrsm2_analysis( + handle, cuda_solve_struct->algorithm, + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], + num_rhs, matrix->get_num_stored_elements(), &one, + cuda_solve_struct->factor_descr, matrix->get_const_values(), + matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), + nullptr, num_rhs, cuda_solve_struct->solve_info, + cuda_solve_struct->policy, + cuda_solve_struct->factor_work_vec); } - solve_struct->factor_work_vec = - exec->alloc(solve_struct->factor_work_size); - - cusparse::csrsm2_analysis( - handle, solve_struct->algorithm, - CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, - matrix->get_size()[0], num_rhs, - matrix->get_num_stored_elements(), &one, - solve_struct->factor_descr, matrix->get_const_values(), - matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), - nullptr, num_rhs, solve_struct->solve_info, - 
solve_struct->policy, solve_struct->factor_work_vec); - } #elif (defined(CUDA_VERSION) && (CUDA_VERSION < 9020)) - { - cusparse::pointer_mode_guard pm_guard(handle); - cusparse::csrsm_analysis( - handle, CUSPARSE_OPERATION_NON_TRANSPOSE, matrix->get_size()[0], - matrix->get_num_stored_elements(), solve_struct->factor_descr, - matrix->get_const_values(), matrix->get_const_row_ptrs(), - matrix->get_const_col_idxs(), solve_struct->solve_info); - } + { + cusparse::pointer_mode_guard pm_guard(handle); + cusparse::csrsm_analysis( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + matrix->get_size()[0], matrix->get_num_stored_elements(), + cuda_solve_struct->factor_descr, matrix->get_const_values(), + matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), + cuda_solve_struct->solve_info); + } #endif + } else { + GKO_NOT_SUPPORTED(solve_struct); + } } else { GKO_NOT_IMPLEMENTED; } @@ -169,62 +276,72 @@ void solve_kernel(std::shared_ptr exec, matrix::Dense *x) { using vec = matrix::Dense; + if (cusparse::is_supported::value) { - ValueType one = 1.0; - auto handle = exec->get_cusparse_handle(); + if (auto cuda_solve_struct = + dynamic_cast(solve_struct)) { + ValueType one = 1.0; + auto handle = exec->get_cusparse_handle(); #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020)) - x->copy_from(gko::lend(b)); - { - cusparse::pointer_mode_guard pm_guard(handle); - cusparse::csrsm2_solve( - handle, solve_struct->algorithm, - CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, - matrix->get_size()[0], b->get_stride(), - matrix->get_num_stored_elements(), &one, - solve_struct->factor_descr, matrix->get_const_values(), - matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), - x->get_values(), b->get_stride(), solve_struct->solve_info, - solve_struct->policy, solve_struct->factor_work_vec); - } + x->copy_from(gko::lend(b)); + { + cusparse::pointer_mode_guard pm_guard(handle); + cusparse::csrsm2_solve( + handle, cuda_solve_struct->algorithm, + 
CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], + b->get_stride(), matrix->get_num_stored_elements(), &one, + cuda_solve_struct->factor_descr, matrix->get_const_values(), + matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), + x->get_values(), b->get_stride(), + cuda_solve_struct->solve_info, cuda_solve_struct->policy, + cuda_solve_struct->factor_work_vec); + } #elif (defined(CUDA_VERSION) && (CUDA_VERSION < 9020)) - { - cusparse::pointer_mode_guard pm_guard(handle); - if (b->get_stride() == 1) { - auto temp_b = const_cast(b->get_const_values()); - cusparse::csrsm_solve( - handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - matrix->get_size()[0], b->get_stride(), &one, - solve_struct->factor_descr, matrix->get_const_values(), - matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), - solve_struct->solve_info, temp_b, b->get_size()[0], - x->get_values(), x->get_size()[0]); - } else { - dense::transpose(exec, trans_b, b); - dense::transpose(exec, trans_x, x); - cusparse::csrsm_solve( - handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - matrix->get_size()[0], trans_b->get_size()[0], &one, - solve_struct->factor_descr, matrix->get_const_values(), - matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), - solve_struct->solve_info, trans_b->get_values(), - trans_b->get_size()[1], trans_x->get_values(), - trans_x->get_size()[1]); - dense::transpose(exec, x, trans_x); + { + cusparse::pointer_mode_guard pm_guard(handle); + if (b->get_stride() == 1) { + cusparse::csrsm_solve( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + matrix->get_size()[0], b->get_stride(), &one, + cuda_solve_struct->factor_descr, + matrix->get_const_values(), + matrix->get_const_row_ptrs(), + matrix->get_const_col_idxs(), + cuda_solve_struct->solve_info, b->get_const_values(), + b->get_size()[0], x->get_values(), x->get_size()[0]); + } else { + dense::transpose(exec, b, trans_b); + dense::transpose(exec, x, trans_x); + cusparse::csrsm_solve( + handle, 
CUSPARSE_OPERATION_NON_TRANSPOSE, + matrix->get_size()[0], trans_b->get_size()[0], &one, + cuda_solve_struct->factor_descr, + matrix->get_const_values(), + matrix->get_const_row_ptrs(), + matrix->get_const_col_idxs(), + cuda_solve_struct->solve_info, trans_b->get_values(), + trans_b->get_size()[1], trans_x->get_values(), + trans_x->get_size()[1]); + dense::transpose(exec, trans_x, x); + } } - } #endif + } else { + GKO_NOT_SUPPORTED(solve_struct); + } } else { GKO_NOT_IMPLEMENTED; } @@ -237,4 +354,4 @@ void solve_kernel(std::shared_ptr exec, } // namespace gko -#endif +#endif // GKO_CUDA_SOLVER_COMMON_TRS_KERNELS_CUH_ diff --git a/cuda/solver/fcg_kernels.cu b/cuda/solver/fcg_kernels.cu index b85c14cff91..ed92ca19120 100644 --- a/cuda/solver/fcg_kernels.cu +++ b/cuda/solver/fcg_kernels.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "cuda/base/math.hpp" #include "cuda/base/types.hpp" +#include "cuda/components/thread_ids.cuh" namespace gko { @@ -54,33 +55,8 @@ namespace fcg { constexpr int default_block_size = 512; -template -__global__ __launch_bounds__(default_block_size) void initialize_kernel( - size_type num_rows, size_type num_cols, size_type stride, - const ValueType *__restrict__ b, ValueType *__restrict__ r, - ValueType *__restrict__ z, ValueType *__restrict__ p, - ValueType *__restrict__ q, ValueType *__restrict__ t, - ValueType *__restrict__ prev_rho, ValueType *__restrict__ rho, - ValueType *__restrict__ rho_t, stopping_status *__restrict__ stop_status) -{ - const auto tidx = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; - - if (tidx < num_cols) { - rho[tidx] = zero(); - prev_rho[tidx] = one(); - rho_t[tidx] = one(); - stop_status[tidx].reset(); - } - - if (tidx < num_rows * stride) { - r[tidx] = b[tidx]; - z[tidx] = zero(); - p[tidx] = zero(); - q[tidx] = zero(); - t[tidx] = b[tidx]; - } -} + +#include "common/solver/fcg_kernels.hpp.inc" template @@ -109,26 +85,6 @@ void initialize(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_INITIALIZE_KERNEL); -template -__global__ __launch_bounds__(default_block_size) void step_1_kernel( - size_type num_rows, size_type num_cols, size_type stride, - ValueType *__restrict__ p, const ValueType *__restrict__ z, - const ValueType *__restrict__ rho, const ValueType *__restrict__ prev_rho, - const stopping_status *__restrict__ stop_status) -{ - const auto tidx = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; - const auto col = tidx % stride; - if (col >= num_cols || tidx >= num_rows * stride || - stop_status[col].has_stopped()) { - return; - } - const auto tmp = rho[col] / prev_rho[col]; - p[tidx] = - prev_rho[col] == zero() ? 
z[tidx] : z[tidx] + tmp * p[tidx]; -} - - template void step_1(std::shared_ptr exec, matrix::Dense *p, const matrix::Dense *z, @@ -151,34 +107,6 @@ void step_1(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_STEP_1_KERNEL); -template -__global__ __launch_bounds__(default_block_size) void step_2_kernel( - size_type num_rows, size_type num_cols, size_type stride, - size_type x_stride, ValueType *__restrict__ x, ValueType *__restrict__ r, - ValueType *__restrict__ t, const ValueType *__restrict__ p, - const ValueType *__restrict__ q, const ValueType *__restrict__ beta, - const ValueType *__restrict__ rho, - const stopping_status *__restrict__ stop_status) -{ - const auto tidx = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; - const auto row = tidx / stride; - const auto col = tidx % stride; - - if (col >= num_cols || tidx >= num_rows * num_cols || - stop_status[col].has_stopped()) { - return; - } - if (beta[col] != zero()) { - const auto tmp = rho[col] / beta[col]; - const auto prev_r = r[tidx]; - x[row * x_stride + col] += tmp * p[tidx]; - r[tidx] -= tmp * q[tidx]; - t[tidx] = r[tidx] - prev_r; - } -} - - template void step_2(std::shared_ptr exec, matrix::Dense *x, matrix::Dense *r, diff --git a/cuda/solver/gmres_kernels.cu b/cuda/solver/gmres_kernels.cu index 56496cf5dd8..0ddddfc74f7 100644 --- a/cuda/solver/gmres_kernels.cu +++ b/cuda/solver/gmres_kernels.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -42,12 +42,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/components/fill_array.hpp" +#include "cuda/base/config.hpp" #include "cuda/base/cublas_bindings.hpp" #include "cuda/base/math.hpp" #include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" #include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" +#include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" @@ -63,49 +66,18 @@ namespace gmres { constexpr int default_block_size = 512; -constexpr int default_dot_dim = cuda_config::warp_size; +// default_dot_dim can not be 64 in hip because 64 * 64 exceeds their max block +// size limit. +constexpr int default_dot_dim = 32; constexpr int default_dot_size = default_dot_dim * default_dot_dim; -// Must be called with at least `max(stride_b * num_rows, krylov_dim * -// num_cols)` threads in total. -template -__global__ __launch_bounds__(block_size) void initialize_1_kernel( - size_type num_rows, size_type num_cols, size_type krylov_dim, - const ValueType *__restrict__ b, size_type stride_b, - ValueType *__restrict__ residual, size_type stride_residual, - ValueType *__restrict__ givens_sin, size_type stride_sin, - ValueType *__restrict__ givens_cos, size_type stride_cos, - stopping_status *__restrict__ stop_status) -{ - const auto global_id = blockIdx.x * blockDim.x + threadIdx.x; - - const auto row_idx = global_id / stride_b; - const auto col_idx = global_id % stride_b; - - if (global_id < num_cols) { - stop_status[global_id].reset(); - } - - if (row_idx < num_rows && col_idx < num_cols) { - residual[row_idx * stride_residual + col_idx] = - b[row_idx * stride_b + col_idx]; - } - - if (global_id < krylov_dim * num_cols) { - const auto row_givens = global_id / num_cols; - const auto col_givens = global_id % num_cols; - - givens_sin[row_givens * stride_sin + col_givens] = zero(); - givens_cos[row_givens * stride_cos + col_givens] = zero(); - } -} +#include "common/solver/gmres_kernels.hpp.inc" template void 
initialize_1(std::shared_ptr exec, const matrix::Dense *b, - matrix::Dense *b_norm, matrix::Dense *residual, matrix::Dense *givens_sin, matrix::Dense *givens_cos, @@ -117,7 +89,6 @@ void initialize_1(std::shared_ptr exec, const dim3 block_dim(default_block_size, 1, 1); constexpr auto block_size = default_block_size; - b->compute_norm2(b_norm); initialize_1_kernel<<>>( b->get_size()[0], b->get_size()[1], krylov_dim, as_cuda_type(b->get_const_values()), b->get_stride(), @@ -130,60 +101,10 @@ void initialize_1(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL); -// Must be called with at least `num_rows * stride_krylov` threads in total. -template -__global__ __launch_bounds__(block_size) void initialize_2_1_kernel( - size_type num_rows, size_type num_rhs, size_type krylov_dim, - ValueType *__restrict__ krylov_bases, size_type stride_krylov, - ValueType *__restrict__ residual_norm_collection, - size_type stride_residual_nc) -{ - const auto global_id = blockIdx.x * blockDim.x + threadIdx.x; - const auto row_idx = global_id / stride_krylov; - const auto col_idx = global_id % stride_krylov; - - if (row_idx < num_rows && col_idx < (krylov_dim + 1) * num_rhs) { - krylov_bases[row_idx * stride_krylov + col_idx] = zero(); - } - - if (row_idx < krylov_dim + 1 && col_idx < num_rhs) { - residual_norm_collection[row_idx * stride_residual_nc + col_idx] = - zero(); - } -} - - -// Must be called with at least `num_rows * num_rhs` threads in total. 
-template -__global__ __launch_bounds__(block_size) void initialize_2_2_kernel( - size_type num_rows, size_type num_rhs, - const ValueType *__restrict__ residual, size_type stride_residual, - const ValueType *__restrict__ residual_norm, - ValueType *__restrict__ residual_norm_collection, - ValueType *__restrict__ krylov_bases, size_type stride_krylov, - size_type *__restrict__ final_iter_nums) -{ - const auto global_id = blockIdx.x * blockDim.x + threadIdx.x; - const auto row_idx = global_id / num_rhs; - const auto col_idx = global_id % num_rhs; - - if (global_id < num_rhs) { - residual_norm_collection[global_id] = residual_norm[global_id]; - final_iter_nums[global_id] = 0; - } - - if (row_idx < num_rows && col_idx < num_rhs) { - krylov_bases[row_idx * stride_krylov + col_idx] = - residual[row_idx * stride_residual + col_idx] / - residual_norm[col_idx]; - } -} - - template void initialize_2(std::shared_ptr exec, const matrix::Dense *residual, - matrix::Dense *residual_norm, + matrix::Dense> *residual_norm, matrix::Dense *residual_norm_collection, matrix::Dense *krylov_bases, Array *final_iter_nums, size_type krylov_dim) @@ -191,16 +112,12 @@ void initialize_2(std::shared_ptr exec, const auto num_rows = residual->get_size()[0]; const auto num_rhs = residual->get_size()[1]; const dim3 grid_dim_1( - ceildiv(num_rows * krylov_bases->get_stride(), default_block_size), 1, - 1); + ceildiv(krylov_bases->get_size()[0] * krylov_bases->get_stride(), + default_block_size), + 1, 1); const dim3 block_dim(default_block_size, 1, 1); constexpr auto block_size = default_block_size; - initialize_2_1_kernel<<>>( - residual->get_size()[0], residual->get_size()[1], krylov_dim, - as_cuda_type(krylov_bases->get_values()), krylov_bases->get_stride(), - as_cuda_type(residual_norm_collection->get_values()), - residual_norm_collection->get_stride()); residual->compute_norm2(residual_norm); const dim3 grid_dim_2(ceildiv(num_rows * num_rhs, default_block_size), 1, @@ -217,200 +134,49 @@ void 
initialize_2(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL); -__global__ - __launch_bounds__(default_block_size) void increase_final_iteration_numbers_kernel( - size_type *__restrict__ final_iter_nums, - const stopping_status *__restrict__ stop_status, size_type total_number) -{ - const auto global_id = threadIdx.x + blockIdx.x * blockDim.x; - if (global_id < total_number) { - final_iter_nums[global_id] += - (1 - stop_status[global_id].has_stopped()); - } -} - - -template -__global__ __launch_bounds__(default_dot_size) void multidot_kernel( - size_type k, size_type num_rows, size_type num_cols, - const ValueType *__restrict__ next_krylov_basis, - size_type stride_next_krylov, const ValueType *__restrict__ krylov_bases, - size_type stride_krylov, ValueType *__restrict__ hessenberg_iter, - size_type stride_hessenberg, - const stopping_status *__restrict__ stop_status) -{ - const auto tidx = threadIdx.x; - const auto tidy = threadIdx.y; - const auto col_idx = blockIdx.x * default_dot_dim + tidx; - const auto num = ceildiv(num_rows, gridDim.y); - const auto start_row = blockIdx.y * num; - const auto end_row = - ((blockIdx.y + 1) * num > num_rows) ? 
num_rows : (blockIdx.y + 1) * num; - // Used that way to get around dynamic initialization warning and - // template error when using `reduction_helper_array` directly in `reduce` - __shared__ - UninitializedArray - reduction_helper_array; - ValueType *__restrict__ reduction_helper = reduction_helper_array; - - ValueType local_res = zero(); - const auto krylov_col = k * num_cols + col_idx; - if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) { - for (size_type i = start_row + tidy; i < end_row; - i += default_dot_dim) { - const auto next_krylov_idx = i * stride_next_krylov + col_idx; - const auto krylov_idx = i * stride_krylov + krylov_col; - local_res += - next_krylov_basis[next_krylov_idx] * krylov_bases[krylov_idx]; - } - } - reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res; - __syncthreads(); - local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx]; - const auto tile_block = - group::tiled_partition(group::this_thread_block()); - const auto sum = - reduce(tile_block, local_res, - [](const ValueType &a, const ValueType &b) { return a + b; }); - const auto new_col_idx = blockIdx.x * default_dot_dim + tidy; - if (tidx == 0 && new_col_idx < num_cols && - !stop_status[new_col_idx].has_stopped()) { - const auto hessenberg_idx = k * stride_hessenberg + new_col_idx; - atomic_add(hessenberg_iter + hessenberg_idx, sum); - } -} - - -// Must be called with at least `num_rows * stride_next_krylov` threads in -// total. 
-template -__global__ __launch_bounds__(block_size) void update_next_krylov_kernel( - size_type k, size_type num_rows, size_type num_cols, - ValueType *__restrict__ next_krylov_basis, size_type stride_next_krylov, - const ValueType *__restrict__ krylov_bases, size_type stride_krylov, - const ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg, - const stopping_status *__restrict__ stop_status) -{ - const auto global_id = blockIdx.x * blockDim.x + threadIdx.x; - const auto row_idx = global_id / stride_next_krylov; - const auto col_idx = global_id % stride_next_krylov; - - if (row_idx < num_rows && col_idx < num_cols && - !stop_status[col_idx].has_stopped()) { - const auto next_krylov_idx = row_idx * stride_next_krylov + col_idx; - const auto krylov_idx = - row_idx * stride_krylov + k * num_cols + col_idx; - const auto hessenberg_idx = k * stride_hessenberg + col_idx; - - next_krylov_basis[next_krylov_idx] -= - hessenberg_iter[hessenberg_idx] * krylov_bases[krylov_idx]; - } -} - - -// Must be called with at least `num_cols` blocks, each with `block_size` -// threads. `block_size` must be a power of 2. 
-template -__global__ __launch_bounds__(block_size) void update_hessenberg_2_kernel( - size_type iter, size_type num_rows, size_type num_cols, - const ValueType *__restrict__ next_krylov_basis, - size_type stride_next_krylov, ValueType *__restrict__ hessenberg_iter, - size_type stride_hessenberg, - const stopping_status *__restrict__ stop_status) -{ - const auto tidx = threadIdx.x; - const auto col_idx = blockIdx.x; - - // Used that way to get around dynamic initialization warning and - // template error when using `reduction_helper_array` directly in `reduce` - __shared__ UninitializedArray reduction_helper_array; - ValueType *__restrict__ reduction_helper = reduction_helper_array; - - if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) { - ValueType local_res{}; - for (size_type i = tidx; i < num_rows; i += block_size) { - const auto next_krylov_idx = i * stride_next_krylov + col_idx; - const auto next_krylov_value = next_krylov_basis[next_krylov_idx]; - - local_res += next_krylov_value * next_krylov_value; - } - - reduction_helper[tidx] = local_res; - - // Perform thread block reduction. Result is in reduction_helper[0] - reduce(group::this_thread_block(), reduction_helper, - [](const ValueType &a, const ValueType &b) { return a + b; }); - - if (tidx == 0) { - hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx] = - sqrt(reduction_helper[0]); - } - } -} - - -// Must be called with at least `num_rows * stride_next_krylov` threads in -// total. 
-template -__global__ __launch_bounds__(block_size) void update_krylov_next_krylov_kernel( - size_type iter, size_type num_rows, size_type num_cols, - ValueType *__restrict__ next_krylov_basis, size_type stride_next_krylov, - ValueType *__restrict__ krylov_bases, size_type stride_krylov, - const ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg, - const stopping_status *__restrict__ stop_status) -{ - const auto global_id = threadIdx.x + blockIdx.x * blockDim.x; - const auto row_idx = global_id / stride_next_krylov; - const auto col_idx = global_id % stride_next_krylov; - const auto hessenberg = - hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx]; - - if (row_idx < num_rows && col_idx < num_cols && - !stop_status[col_idx].has_stopped()) { - const auto next_krylov_idx = row_idx * stride_next_krylov + col_idx; - const auto krylov_idx = - row_idx * stride_krylov + num_cols * (iter + 1) + col_idx; - - const auto next_krylov_value = - next_krylov_basis[next_krylov_idx] / hessenberg; - - next_krylov_basis[next_krylov_idx] = next_krylov_value; - krylov_bases[krylov_idx] = next_krylov_value; - } -} - - template void finish_arnoldi(std::shared_ptr exec, - matrix::Dense *next_krylov_basis, - matrix::Dense *krylov_bases, + size_type num_rows, matrix::Dense *krylov_bases, matrix::Dense *hessenberg_iter, size_type iter, const stopping_status *stop_status) { - const auto stride_next_krylov = next_krylov_basis->get_stride(); const auto stride_krylov = krylov_bases->get_stride(); const auto stride_hessenberg = hessenberg_iter->get_stride(); - const auto dim_size = next_krylov_basis->get_size(); auto cublas_handle = exec->get_cublas_handle(); - const dim3 grid_size(ceildiv(dim_size[1], default_dot_dim), - exec->get_num_multiprocessor() * 2); + const dim3 grid_size( + ceildiv(hessenberg_iter->get_size()[1], default_dot_dim), + exec->get_num_multiprocessor() * 2); const dim3 block_size(default_dot_dim, default_dot_dim); + auto next_krylov_basis = + 
krylov_bases->get_values() + + (iter + 1) * num_rows * hessenberg_iter->get_size()[1]; for (size_type k = 0; k < iter + 1; ++k) { - zero_array(dim_size[1], - hessenberg_iter->get_values() + k * stride_hessenberg); - multidot_kernel<<>>( - k, dim_size[0], dim_size[1], - as_cuda_type(next_krylov_basis->get_const_values()), - stride_next_krylov, as_cuda_type(krylov_bases->get_const_values()), - stride_krylov, as_cuda_type(hessenberg_iter->get_values()), - stride_hessenberg, as_cuda_type(stop_status)); + const auto k_krylov_bases = + krylov_bases->get_const_values() + + k * num_rows * hessenberg_iter->get_size()[1]; + if (hessenberg_iter->get_size()[1] > 1) { + // TODO: this condition should be tuned + // single rhs will use vendor's dot, otherwise, use our own + // multidot_kernel which parallelize multiple rhs. + components::fill_array( + exec, hessenberg_iter->get_values() + k * stride_hessenberg, + hessenberg_iter->get_size()[1], zero()); + multidot_kernel<<>>( + k, num_rows, hessenberg_iter->get_size()[1], + as_cuda_type(k_krylov_bases), as_cuda_type(next_krylov_basis), + stride_krylov, as_cuda_type(hessenberg_iter->get_values()), + stride_hessenberg, as_cuda_type(stop_status)); + } else { + cublas::dot(exec->get_cublas_handle(), num_rows, k_krylov_bases, + stride_krylov, next_krylov_basis, stride_krylov, + hessenberg_iter->get_values() + k * stride_hessenberg); + } update_next_krylov_kernel - <<>>( - k, dim_size[0], dim_size[1], - as_cuda_type(next_krylov_basis->get_values()), - stride_next_krylov, - as_cuda_type(krylov_bases->get_const_values()), stride_krylov, + k, num_rows, hessenberg_iter->get_size()[1], + as_cuda_type(k_krylov_bases), as_cuda_type(next_krylov_basis), + stride_krylov, as_cuda_type(hessenberg_iter->get_const_values()), stride_hessenberg, as_cuda_type(stop_status)); } @@ -421,156 +187,32 @@ void finish_arnoldi(std::shared_ptr exec, update_hessenberg_2_kernel - <<>>( - iter, dim_size[0], dim_size[1], - 
as_cuda_type(next_krylov_basis->get_const_values()), - stride_next_krylov, as_cuda_type(hessenberg_iter->get_values()), - stride_hessenberg, as_cuda_type(stop_status)); - - update_krylov_next_krylov_kernel - <<get_size()[1], default_block_size>>>( + iter, num_rows, hessenberg_iter->get_size()[1], + as_cuda_type(next_krylov_basis), stride_krylov, + as_cuda_type(hessenberg_iter->get_values()), stride_hessenberg, + as_cuda_type(stop_status)); + + update_krylov_kernel + <<>>( - iter, dim_size[0], dim_size[1], - as_cuda_type(next_krylov_basis->get_values()), stride_next_krylov, - as_cuda_type(krylov_bases->get_values()), stride_krylov, + iter, num_rows, hessenberg_iter->get_size()[1], + as_cuda_type(next_krylov_basis), stride_krylov, as_cuda_type(hessenberg_iter->get_const_values()), stride_hessenberg, as_cuda_type(stop_status)); // next_krylov_basis /= hessenberg(iter, iter + 1) - // krylov_bases(:, iter + 1) = next_krylov_basis // End of arnoldi } -template -__device__ void calculate_sin_and_cos_kernel( - size_type col_idx, size_type num_cols, size_type iter, - const ValueType *hessenberg_iter, size_type stride_hessenberg, - ValueType *givens_sin, size_type stride_sin, ValueType *givens_cos, - size_type stride_cos) -{ - if (hessenberg_iter[iter * stride_hessenberg + col_idx] == - zero()) { - givens_cos[iter * stride_cos + col_idx] = zero(); - givens_sin[iter * stride_sin + col_idx] = one(); - } else { - auto hypotenuse = - sqrt(hessenberg_iter[iter * stride_hessenberg + col_idx] * - hessenberg_iter[iter * stride_hessenberg + col_idx] + - hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx] * - hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx]); - givens_cos[iter * stride_cos + col_idx] = - abs(hessenberg_iter[iter * stride_hessenberg + col_idx]) / - hypotenuse; - givens_sin[iter * stride_sin + col_idx] = - givens_cos[iter * stride_cos + col_idx] * - hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx] / - hessenberg_iter[iter * stride_hessenberg + 
col_idx]; - } -} - - -template -__device__ void calculate_residual_norm_kernel( - size_type col_idx, size_type num_cols, size_type iter, - const ValueType *givens_sin, size_type stride_sin, - const ValueType *givens_cos, size_type stride_cos, ValueType *residual_norm, - ValueType *residual_norm_collection, - size_type stride_residual_norm_collection, const ValueType *b_norm) -{ - residual_norm_collection[(iter + 1) * stride_residual_norm_collection + - col_idx] = - -givens_sin[iter * stride_sin + col_idx] * - residual_norm_collection[iter * stride_residual_norm_collection + - col_idx]; - residual_norm_collection[iter * stride_residual_norm_collection + col_idx] = - givens_cos[iter * stride_cos + col_idx] * - residual_norm_collection[iter * stride_residual_norm_collection + - col_idx]; - residual_norm[col_idx] = - abs(residual_norm_collection[(iter + 1) * - stride_residual_norm_collection + - col_idx]) / - b_norm[col_idx]; -} - - -// Must be called with at least `num_cols` threads in total. 
-template -__global__ __launch_bounds__(block_size) void givens_rotation_kernel( - size_type num_rows, size_type num_cols, size_type iter, - ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg, - ValueType *__restrict__ givens_sin, size_type stride_sin, - ValueType *__restrict__ givens_cos, size_type stride_cos, - ValueType *__restrict__ residual_norm, - ValueType *__restrict__ residual_norm_collection, - size_type stride_residual_norm_collection, - const ValueType *__restrict__ b_norm, - const stopping_status *__restrict__ stop_status) -{ - const auto col_idx = blockIdx.x * blockDim.x + threadIdx.x; - - if (col_idx >= num_cols || stop_status[col_idx].has_stopped()) { - return; - } - - const auto current_thread_block = group::this_thread_block(); - - for (size_type i = 0; i < iter; ++i) { - const auto tmp = - givens_cos[i * stride_cos + col_idx] * - hessenberg_iter[i * stride_hessenberg + col_idx] + - givens_sin[i * stride_sin + col_idx] * - hessenberg_iter[(i + 1) * stride_hessenberg + col_idx]; - current_thread_block.sync(); - hessenberg_iter[(i + 1) * stride_hessenberg + col_idx] = - givens_cos[i * stride_cos + col_idx] * - hessenberg_iter[(i + 1) * stride_hessenberg + col_idx] - - givens_sin[i * stride_sin + col_idx] * - hessenberg_iter[i * stride_hessenberg + col_idx]; - hessenberg_iter[i * stride_hessenberg + col_idx] = tmp; - current_thread_block.sync(); - } - // for j in 1:iter - 1 - // temp = cos(j)*hessenberg(j) + - // sin(j)*hessenberg(j+1) - // hessenberg(j+1) = -sin(j)*hessenberg(j) + - // cos(j)*hessenberg(j+1) - // hessenberg(j) = temp; - // end - - calculate_sin_and_cos_kernel(col_idx, num_cols, iter, hessenberg_iter, - stride_hessenberg, givens_sin, stride_sin, - givens_cos, stride_cos); - // Calculate sin and cos - - hessenberg_iter[iter * stride_hessenberg + col_idx] = - givens_cos[iter * stride_cos + col_idx] * - hessenberg_iter[iter * stride_hessenberg + col_idx] + - givens_sin[iter * stride_sin + col_idx] * - 
hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx]; - hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx] = - zero(); - // hessenberg(iter) = cos(iter)*hessenberg(iter) + - // sin(iter)*hessenberg(iter) - // hessenberg(iter+1) = 0 - - calculate_residual_norm_kernel(col_idx, num_cols, iter, givens_sin, - stride_sin, givens_cos, stride_cos, - residual_norm, residual_norm_collection, - stride_residual_norm_collection, b_norm); - // Calculate residual norm -} - - template void givens_rotation(std::shared_ptr exec, matrix::Dense *givens_sin, matrix::Dense *givens_cos, matrix::Dense *hessenberg_iter, - matrix::Dense *residual_norm, + matrix::Dense> *residual_norm, matrix::Dense *residual_norm_collection, - const matrix::Dense *b_norm, size_type iter, - const Array *stop_status) + size_type iter, const Array *stop_status) { // TODO: tune block_size for optimal performance constexpr auto block_size = default_block_size; @@ -587,21 +229,18 @@ void givens_rotation(std::shared_ptr exec, givens_cos->get_stride(), as_cuda_type(residual_norm->get_values()), as_cuda_type(residual_norm_collection->get_values()), residual_norm_collection->get_stride(), - as_cuda_type(b_norm->get_const_values()), as_cuda_type(stop_status->get_const_data())); } template -void step_1(std::shared_ptr exec, - matrix::Dense *next_krylov_basis, +void step_1(std::shared_ptr exec, size_type num_rows, matrix::Dense *givens_sin, matrix::Dense *givens_cos, - matrix::Dense *residual_norm, + matrix::Dense> *residual_norm, matrix::Dense *residual_norm_collection, matrix::Dense *krylov_bases, - matrix::Dense *hessenberg_iter, - const matrix::Dense *b_norm, size_type iter, + matrix::Dense *hessenberg_iter, size_type iter, Array *final_iter_nums, const Array *stop_status) { @@ -611,75 +250,15 @@ void step_1(std::shared_ptr exec, default_block_size>>>(as_cuda_type(final_iter_nums->get_data()), as_cuda_type(stop_status->get_const_data()), final_iter_nums->get_num_elems()); - finish_arnoldi(exec, 
next_krylov_basis, krylov_bases, hessenberg_iter, iter, + finish_arnoldi(exec, num_rows, krylov_bases, hessenberg_iter, iter, stop_status->get_const_data()); givens_rotation(exec, givens_sin, givens_cos, hessenberg_iter, - residual_norm, residual_norm_collection, b_norm, iter, - stop_status); + residual_norm, residual_norm_collection, iter, stop_status); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_STEP_1_KERNEL); -// Must be called with at least `num_rhs` threads in total. -template -__global__ __launch_bounds__(block_size) void solve_upper_triangular_kernel( - size_type num_cols, size_type num_rhs, - const ValueType *__restrict__ residual_norm_collection, - size_type stride_residual_norm_collection, - const ValueType *__restrict__ hessenberg, size_type stride_hessenberg, - ValueType *__restrict__ y, size_type stride_y, - const size_type *__restrict__ final_iter_nums) -{ - const auto col_idx = blockIdx.x * blockDim.x + threadIdx.x; - - if (col_idx >= num_rhs) { - return; - } - - for (int i = final_iter_nums[col_idx] - 1; i >= 0; --i) { - auto temp = - residual_norm_collection[i * stride_residual_norm_collection + - col_idx]; - for (size_type j = i + 1; j < final_iter_nums[col_idx]; ++j) { - temp -= hessenberg[i * stride_hessenberg + j * num_rhs + col_idx] * - y[j * stride_y + col_idx]; - } - - y[i * stride_y + col_idx] = - temp / hessenberg[i * stride_hessenberg + i * num_rhs + col_idx]; - } - // Solve upper triangular. - // y = hessenberg \ residual_norm_collection -} - - -// Must be called with at least `stride_preconditioner * num_rows` threads in -// total. 
-template -__global__ __launch_bounds__(block_size) void calculate_Qy_kernel( - size_type num_rows, size_type num_cols, size_type num_rhs, - const ValueType *__restrict__ krylov_bases, size_type stride_krylov, - const ValueType *__restrict__ y, size_type stride_y, - ValueType *__restrict__ before_preconditioner, - size_type stride_preconditioner, - const size_type *__restrict__ final_iter_nums) -{ - const auto global_id = blockIdx.x * blockDim.x + threadIdx.x; - const auto row_id = global_id / stride_preconditioner; - const auto col_id = global_id % stride_preconditioner; - - if (row_id < num_rows && col_id < num_cols) { - before_preconditioner[global_id] = zero(); - for (size_type j = 0; j < final_iter_nums[col_id]; ++j) { - before_preconditioner[global_id] += - krylov_bases[row_id * stride_krylov + j * num_rhs + col_id] * - y[j * stride_y + col_id]; - } - } -} - - template void solve_upper_triangular( const matrix::Dense *residual_norm_collection, diff --git a/cuda/solver/ir_kernels.cu b/cuda/solver/ir_kernels.cu index e1f0dbcdc46..7b26ab3527f 100644 --- a/cuda/solver/ir_kernels.cu +++ b/cuda/solver/ir_kernels.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -36,6 +36,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "cuda/components/thread_ids.cuh" + + namespace gko { namespace kernels { namespace cuda { @@ -50,16 +53,7 @@ namespace ir { constexpr int default_block_size = 512; -__global__ __launch_bounds__(default_block_size) void initialize_kernel( - size_type num_cols, stopping_status *stop_status) -{ - const auto tidx = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; - - if (tidx < num_cols) { - stop_status[tidx].reset(); - } -} +#include "common/solver/ir_kernels.hpp.inc" void initialize(std::shared_ptr exec, diff --git a/cuda/solver/lower_trs_kernels.cu b/cuda/solver/lower_trs_kernels.cu index 5f6549884d9..1cd2764d481 100644 --- a/cuda/solver/lower_trs_kernels.cu +++ b/cuda/solver/lower_trs_kernels.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -45,7 +45,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include "core/solver/lower_trs_kernels.hpp" #include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/math.hpp" #include "cuda/base/types.hpp" diff --git a/cuda/solver/upper_trs_kernels.cu b/cuda/solver/upper_trs_kernels.cu index 083db4a94b2..0518b11bed7 100644 --- a/cuda/solver/upper_trs_kernels.cu +++ b/cuda/solver/upper_trs_kernels.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -45,7 +45,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include "core/solver/upper_trs_kernels.hpp" #include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/math.hpp" #include "cuda/base/types.hpp" diff --git a/cuda/stop/criterion_kernels.cu b/cuda/stop/criterion_kernels.cu index a274bd95021..390f96cb2f2 100644 --- a/cuda/stop/criterion_kernels.cu +++ b/cuda/stop/criterion_kernels.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/math.hpp" #include "cuda/base/types.hpp" +#include "cuda/components/thread_ids.cuh" namespace gko { @@ -60,8 +61,7 @@ __global__ __launch_bounds__(default_block_size) void set_all_statuses( size_type num_elems, uint8 stoppingId, bool setFinalized, stopping_status *stop_status) { - const auto tidx = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; + const auto tidx = thread::get_thread_id_flat(); if (tidx < num_elems) { stop_status[tidx].stop(stoppingId, setFinalized); } diff --git a/cuda/stop/residual_norm_reduction_kernels.cu b/cuda/stop/residual_norm_kernels.cu similarity index 63% rename from cuda/stop/residual_norm_reduction_kernels.cu rename to cuda/stop/residual_norm_kernels.cu index 189f2269152..45f2c2336d5 100644 --- a/cuda/stop/residual_norm_reduction_kernels.cu +++ b/cuda/stop/residual_norm_kernels.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,44 +30,44 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ -#include "core/stop/residual_norm_reduction_kernels.hpp" +#include "core/stop/residual_norm_kernels.hpp" #include #include -#include +#include #include "cuda/base/math.hpp" #include "cuda/base/types.hpp" +#include "cuda/components/thread_ids.cuh" + namespace gko { namespace kernels { namespace cuda { /** - * @brief The Residual norm reduction stopping criterion namespace. + * @brief The Residual norm stopping criterion namespace. * @ref resnorm * @ingroup resnorm */ -namespace residual_norm_reduction { +namespace residual_norm { constexpr int default_block_size = 512; template -__global__ - __launch_bounds__(default_block_size) void residual_norm_reduction_kernel( - size_type num_cols, remove_complex rel_residual_goal, - const ValueType *__restrict__ tau, - const ValueType *__restrict__ orig_tau, uint8 stoppingId, - bool setFinalized, stopping_status *__restrict__ stop_status, - bool *__restrict__ device_storage) +__global__ __launch_bounds__(default_block_size) void residual_norm_kernel( + size_type num_cols, ValueType rel_residual_goal, + const ValueType *__restrict__ tau, const ValueType *__restrict__ orig_tau, + uint8 stoppingId, bool setFinalized, + stopping_status *__restrict__ stop_status, + bool *__restrict__ device_storage) { - const auto tidx = - static_cast(blockDim.x) * blockIdx.x + threadIdx.x; + const auto tidx = thread::get_thread_id_flat(); if (tidx < num_cols) { - if (abs(tau[tidx]) < rel_residual_goal * abs(orig_tau[tidx])) { + if (tau[tidx] < rel_residual_goal * orig_tau[tidx]) { stop_status[tidx].converge(stoppingId, setFinalized); device_storage[1] = true; } @@ -89,21 +89,22 @@ __global__ __launch_bounds__(1) void init_kernel( template -void residual_norm_reduction(std::shared_ptr exec, - const matrix::Dense *tau, - const matrix::Dense *orig_tau, - remove_complex rel_residual_goal, - uint8 stoppingId, bool setFinalized, - Array *stop_status, - Array *device_storage, bool 
*all_converged, - bool *one_changed) +void residual_norm(std::shared_ptr exec, + const matrix::Dense *tau, + const matrix::Dense *orig_tau, + ValueType rel_residual_goal, uint8 stoppingId, + bool setFinalized, Array *stop_status, + Array *device_storage, bool *all_converged, + bool *one_changed) { + static_assert(is_complex_s::value == false, + "ValueType must not be complex in this function!"); init_kernel<<<1, 1>>>(as_cuda_type(device_storage->get_data())); const dim3 block_size(default_block_size, 1, 1); const dim3 grid_size(ceildiv(tau->get_size()[1], block_size.x), 1, 1); - residual_norm_reduction_kernel<<>>( + residual_norm_kernel<<>>( tau->get_size()[1], rel_residual_goal, as_cuda_type(tau->get_const_values()), as_cuda_type(orig_tau->get_const_values()), stoppingId, setFinalized, @@ -111,17 +112,15 @@ void residual_norm_reduction(std::shared_ptr exec, as_cuda_type(device_storage->get_data())); /* Represents all_converged, one_changed */ - bool tmp[2] = {true, false}; - exec->get_master()->copy_from(exec.get(), 2, - device_storage->get_const_data(), tmp); - *all_converged = tmp[0]; - *one_changed = tmp[1]; + *all_converged = exec->copy_val_to_host(device_storage->get_const_data()); + *one_changed = exec->copy_val_to_host(device_storage->get_const_data() + 1); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE( + GKO_DECLARE_RESIDUAL_NORM_KERNEL); -} // namespace residual_norm_reduction +} // namespace residual_norm } // namespace cuda } // namespace kernels } // namespace gko diff --git a/cuda/test/CMakeLists.txt b/cuda/test/CMakeLists.txt index 44d4194c226..5b180f32c11 100644 --- a/cuda/test/CMakeLists.txt +++ b/cuda/test/CMakeLists.txt @@ -1,4 +1,7 @@ +include(${CMAKE_SOURCE_DIR}/cmake/create_test.cmake) + add_subdirectory(base) +add_subdirectory(components) add_subdirectory(factorization) add_subdirectory(matrix) add_subdirectory(preconditioner) diff --git 
a/cuda/test/base/cuda_executor.cu b/cuda/test/base/cuda_executor.cu index fe40d1ee3aa..2bcf5961bbd 100644 --- a/cuda/test/base/cuda_executor.cu +++ b/cuda/test/base/cuda_executor.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include @@ -43,25 +44,36 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "cuda/test/utils.hpp" + + namespace { class ExampleOperation : public gko::Operation { public: explicit ExampleOperation(int &val) : value(val) {} + void run(std::shared_ptr) const override { value = -1; } - void run(std::shared_ptr cuda) const override - { - cudaGetDevice(&value); - } + void run(std::shared_ptr) const override { value = -2; } + void run(std::shared_ptr) const override + { + value = -3; + } + + void run(std::shared_ptr) const override + { + cudaGetDevice(&value); + } + int &value; }; @@ -107,7 +119,10 @@ TEST_F(CudaExecutor, MasterKnowsNumberOfDevices) { int count = 0; cudaGetDeviceCount(&count); - ASSERT_EQ(count, gko::CudaExecutor::get_num_devices()); + + auto num_devices = gko::CudaExecutor::get_num_devices(); + + ASSERT_EQ(count, num_devices); } @@ -175,6 +190,7 @@ TEST_F(CudaExecutor, CopiesDataFromCuda) cuda->free(orig); } + /* Properly checks if it works only when multiple GPUs exist */ TEST_F(CudaExecutor, PreservesDeviceSettings) { @@ -190,14 +206,18 @@ TEST_F(CudaExecutor, PreservesDeviceSettings) ASSERT_EQ(current_device, previous_device); } + TEST_F(CudaExecutor, RunsOnProperDevice) { int value = -1; + GKO_ASSERT_NO_CUDA_ERRORS(cudaSetDevice(0)); cuda2->run(ExampleOperation(value)); + ASSERT_EQ(value, cuda2->get_device_id()); } + TEST_F(CudaExecutor, CopiesDataFromCudaToCuda) { int 
copy[2]; @@ -215,15 +235,15 @@ TEST_F(CudaExecutor, CopiesDataFromCudaToCuda) GKO_ASSERT_NO_CUDA_ERRORS(cudaSetDevice(0)); cuda2->run(ExampleOperation(value)); ASSERT_EQ(value, cuda2->get_device_id()); - + // Put the results on OpenMP and run CPU side assertions omp->copy_from(cuda2.get(), 2, copy_cuda2, copy); - EXPECT_EQ(3, copy[0]); ASSERT_EQ(8, copy[1]); cuda->free(copy_cuda2); cuda->free(orig); } + TEST_F(CudaExecutor, Synchronizes) { // Todo design a proper unit test once we support streams diff --git a/cuda/test/base/exception_helpers.cu b/cuda/test/base/exception_helpers.cu index 2d9e95ddf9b..1652594803a 100644 --- a/cuda/test/base/exception_helpers.cu +++ b/cuda/test/base/exception_helpers.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,14 +33,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include - - #include #include #include +#include + + namespace { diff --git a/cuda/test/base/math.cu b/cuda/test/base/math.cu index a4d84e2b736..08deb9a29d9 100644 --- a/cuda/test/base/math.cu +++ b/cuda/test/base/math.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -47,50 +47,66 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "cuda/base/math.hpp" #include "cuda/base/types.hpp" +#include "cuda/test/utils.hpp" namespace { +namespace kernel { -template -__global__ void test_real_isfinite(bool *result) +template +__device__ bool test_real_is_finite_function(FuncType isfin) { - constexpr T inf = INFINITY; + constexpr T inf = gko::device_numeric_limits::inf; + constexpr T quiet_nan = NAN; bool test_true{}; bool test_false{}; - test_true = - gko::isfinite(T{0}) && gko::isfinite(-T{0}) && gko::isfinite(T{1}); - test_false = gko::isfinite(inf) || gko::isfinite(-inf) || - gko::isfinite(NAN) || gko::isfinite(inf - inf) || - gko::isfinite(inf / inf) || gko::isfinite(inf * T{2}) || - gko::isfinite(T{1} / T{0}) || gko::isfinite(T{0} / T{0}); - *result = test_true && !test_false; + test_true = isfin(T{0}) && isfin(-T{0}) && isfin(T{1}); + test_false = isfin(inf) || isfin(-inf) || isfin(quiet_nan) || + isfin(inf - inf) || isfin(inf / inf) || isfin(inf * T{2}) || + isfin(T{1} / T{0}) || isfin(T{0} / T{0}); + return test_true && !test_false; } -template -__global__ void test_complex_isfinite(bool *result) +template +__device__ bool test_complex_is_finite_function(FuncType isfin) { static_assert(gko::is_complex_s::value, "Template type must be a complex type."); using T = gko::remove_complex; using c_type = gko::kernels::cuda::cuda_type; - constexpr T inf = INFINITY; + constexpr T inf = gko::device_numeric_limits::inf; constexpr T quiet_nan = NAN; bool test_true{}; bool test_false{}; - test_true = gko::isfinite(c_type{T{0}, T{0}}) && - gko::isfinite(c_type{-T{0}, -T{0}}) && - gko::isfinite(c_type{T{1}, T{0}}) && - gko::isfinite(c_type{T{0}, T{1}}); - test_false = - gko::isfinite(c_type{inf, T{0}}) || gko::isfinite(c_type{-inf, T{0}}) || - gko::isfinite(c_type{quiet_nan, T{0}}) || - gko::isfinite(c_type{T{0}, inf}) || gko::isfinite(c_type{T{0}, -inf}) || - gko::isfinite(c_type{T{0}, quiet_nan}); - *result = test_true && !test_false; + test_true = isfin(c_type{T{0}, T{0}}) && isfin(c_type{-T{0}, 
-T{0}}) && + isfin(c_type{T{1}, T{0}}) && isfin(c_type{T{0}, T{1}}); + test_false = isfin(c_type{inf, T{0}}) || isfin(c_type{-inf, T{0}}) || + isfin(c_type{quiet_nan, T{0}}) || isfin(c_type{T{0}, inf}) || + isfin(c_type{T{0}, -inf}) || isfin(c_type{T{0}, quiet_nan}); + return test_true && !test_false; +} + + +} // namespace kernel + + +template +__global__ void test_real_is_finite(bool *result) +{ + *result = kernel::test_real_is_finite_function( + [](T val) { return gko::is_finite(val); }); +} + + +template +__global__ void test_complex_is_finite(bool *result) +{ + *result = kernel::test_complex_is_finite_function( + [](ComplexType val) { return gko::is_finite(val); }); } @@ -102,19 +118,19 @@ protected: {} template - bool test_real_isfinite_kernel() + bool test_real_is_finite_kernel() { gko::Array result(cuda, 1); - test_real_isfinite<<<1, 1>>>(result.get_data()); + test_real_is_finite<<<1, 1>>>(result.get_data()); result.set_executor(ref); return *result.get_data(); } template - bool test_complex_isfinite_kernel() + bool test_complex_is_finite_kernel() { gko::Array result(cuda, 1); - test_complex_isfinite<<<1, 1>>>(result.get_data()); + test_complex_is_finite<<<1, 1>>>(result.get_data()); result.set_executor(ref); return *result.get_data(); } @@ -124,21 +140,21 @@ protected: }; -TEST_F(IsFinite, Float) { ASSERT_TRUE(test_real_isfinite_kernel()); } +TEST_F(IsFinite, Float) { ASSERT_TRUE(test_real_is_finite_kernel()); } -TEST_F(IsFinite, Double) { ASSERT_TRUE(test_real_isfinite_kernel()); } +TEST_F(IsFinite, Double) { ASSERT_TRUE(test_real_is_finite_kernel()); } TEST_F(IsFinite, FloatComplex) { - ASSERT_TRUE(test_complex_isfinite_kernel>()); + ASSERT_TRUE(test_complex_is_finite_kernel>()); } TEST_F(IsFinite, DoubleComplex) { - ASSERT_TRUE(test_complex_isfinite_kernel>()); + ASSERT_TRUE(test_complex_is_finite_kernel>()); } diff --git a/cuda/test/components/CMakeLists.txt b/cuda/test/components/CMakeLists.txt new file mode 100644 index 00000000000..154a39e963e --- 
/dev/null +++ b/cuda/test/components/CMakeLists.txt @@ -0,0 +1,7 @@ +ginkgo_create_cuda_test(cooperative_groups_kernels) +ginkgo_create_cuda_test(merging_kernels) +ginkgo_create_cuda_test(searching_kernels) +ginkgo_create_cuda_test(sorting_kernels) +ginkgo_create_test(fill_array) +ginkgo_create_test(precision_conversion) +ginkgo_create_test(prefix_sum) diff --git a/cuda/test/components/cooperative_groups_kernels.cu b/cuda/test/components/cooperative_groups_kernels.cu new file mode 100644 index 00000000000..e565a6c9952 --- /dev/null +++ b/cuda/test/components/cooperative_groups_kernels.cu @@ -0,0 +1,262 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "cuda/components/cooperative_groups.cuh" + + +#include + + +#include + + +#include +#include + + +#include "cuda/base/config.hpp" +#include "cuda/test/utils.hpp" + + +namespace { + + +using namespace gko::kernels::cuda; + + +class CooperativeGroups : public ::testing::Test { +protected: + CooperativeGroups() + : ref(gko::ReferenceExecutor::create()), + cuda(gko::CudaExecutor::create(0, ref)), + result(ref, 1), + dresult(cuda) + { + *result.get_data() = true; + dresult = result; + } + + template + void test(Kernel kernel) + { + kernel<<<1, config::warp_size>>>(dresult.get_data()); + result = dresult; + auto success = *result.get_const_data(); + + ASSERT_TRUE(success); + } + + template + void test_subwarp(Kernel kernel) + { + kernel<<<1, config::warp_size / 2>>>(dresult.get_data()); + result = dresult; + auto success = *result.get_const_data(); + + ASSERT_TRUE(success); + } + + std::shared_ptr ref; + std::shared_ptr cuda; + gko::Array result; + gko::Array dresult; +}; + + +constexpr static int subwarp_size = config::warp_size / 4; + + +__device__ void test_assert(bool *success, bool partial) +{ + if (!partial) { + *success = false; + } +} + + +__global__ void cg_shuffle(bool *s) +{ + auto group = + group::tiled_partition(group::this_thread_block()); + auto i = int(group.thread_rank()); + test_assert(s, group.shfl_up(i, 1) == max(0, i - 1)); + test_assert(s, group.shfl_down(i, 1) 
== min(i + 1, config::warp_size - 1)); + test_assert(s, group.shfl(i, 0) == 0); +} + +TEST_F(CooperativeGroups, Shuffle) { test(cg_shuffle); } + + +__global__ void cg_all(bool *s) +{ + auto group = + group::tiled_partition(group::this_thread_block()); + test_assert(s, group.all(true)); + test_assert(s, !group.all(false)); + test_assert(s, !group.all(threadIdx.x < 13)); +} + +TEST_F(CooperativeGroups, All) { test(cg_all); } + + +__global__ void cg_any(bool *s) +{ + auto group = + group::tiled_partition(group::this_thread_block()); + test_assert(s, group.any(true)); + test_assert(s, group.any(threadIdx.x == 0)); + test_assert(s, !group.any(false)); +} + +TEST_F(CooperativeGroups, Any) { test(cg_any); } + + +__global__ void cg_ballot(bool *s) +{ + auto group = + group::tiled_partition(group::this_thread_block()); + test_assert(s, group.ballot(false) == 0); + test_assert(s, group.ballot(true) == ~config::lane_mask_type{}); + test_assert(s, group.ballot(threadIdx.x < 4) == 0xf); +} + +TEST_F(CooperativeGroups, Ballot) { test(cg_ballot); } + + +__global__ void cg_subwarp_shuffle(bool *s) +{ + auto group = + group::tiled_partition(group::this_thread_block()); + auto i = int(group.thread_rank()); + test_assert(s, group.shfl_up(i, 1) == max(i - 1, 0)); + test_assert(s, group.shfl_down(i, 1) == min(i + 1, subwarp_size - 1)); + auto group_base = threadIdx.x / subwarp_size * subwarp_size; + test_assert(s, group.shfl(int(threadIdx.x), 0) == group_base); + if (threadIdx.x / subwarp_size == 1) { + test_assert(s, group.shfl_up(i, 1) == max(i - 1, 0)); + test_assert(s, group.shfl_down(i, 1) == min(i + 1, subwarp_size - 1)); + test_assert(s, group.shfl(int(threadIdx.x), 0) == group_base); + } else { + test_assert(s, group.shfl_down(i, 1) == min(i + 1, subwarp_size - 1)); + test_assert(s, group.shfl(int(threadIdx.x), 0) == group_base); + test_assert(s, group.shfl_up(i, 1) == max(i - 1, 0)); + } +} + +TEST_F(CooperativeGroups, SubwarpShuffle) { test(cg_subwarp_shuffle); } + 
+TEST_F(CooperativeGroups, SubwarpShuffle2) { test_subwarp(cg_subwarp_shuffle); } + + +__global__ void cg_subwarp_all(bool *s) +{ + auto grp = threadIdx.x / subwarp_size; + bool test_grp = grp == 1; + auto i = threadIdx.x % subwarp_size; + // only test with test_grp, the other threads run 'interference' + auto group = + group::tiled_partition(group::this_thread_block()); + test_assert(s, !test_grp || group.all(test_grp)); + test_assert(s, !test_grp || !group.all(!test_grp)); + test_assert(s, !test_grp || !group.all(i < subwarp_size - 3 || !test_grp)); + if (test_grp) { + test_assert(s, group.all(true)); + test_assert(s, !group.all(false)); + test_assert(s, !group.all(i < subwarp_size - 3)); + } else { + test_assert(s, !group.all(false)); + test_assert(s, !group.all(i < subwarp_size - 3)); + test_assert(s, group.all(true)); + } +} + +TEST_F(CooperativeGroups, SubwarpAll) { test(cg_subwarp_all); } + +TEST_F(CooperativeGroups, SubwarpAll2) { test_subwarp(cg_subwarp_all); } + + +__global__ void cg_subwarp_any(bool *s) +{ + auto grp = threadIdx.x / subwarp_size; + bool test_grp = grp == 1; + // only test with test_grp, the other threads run 'interference' + auto group = + group::tiled_partition(group::this_thread_block()); + auto i = group.thread_rank(); + test_assert(s, !test_grp || group.any(test_grp)); + test_assert(s, !test_grp || group.any(test_grp && i == 1)); + test_assert(s, !test_grp || !group.any(!test_grp)); + if (test_grp) { + test_assert(s, group.any(true)); + test_assert(s, group.any(i == 1)); + test_assert(s, !group.any(false)); + } else { + test_assert(s, !group.any(false)); + test_assert(s, group.any(true)); + test_assert(s, group.any(i == 1)); + } +} + +TEST_F(CooperativeGroups, SubwarpAny) { test(cg_subwarp_any); } + +TEST_F(CooperativeGroups, SubwarpAny2) { test_subwarp(cg_subwarp_any); } + + +__global__ void cg_subwarp_ballot(bool *s) +{ + auto grp = threadIdx.x / subwarp_size; + bool test_grp = grp == 1; + auto full_mask = 
(config::lane_mask_type{1} << subwarp_size) - 1; + // only test with test_grp, the other threads run 'interference' + auto group = + group::tiled_partition(group::this_thread_block()); + auto i = group.thread_rank(); + test_assert(s, !test_grp || group.ballot(!test_grp) == 0); + test_assert(s, !test_grp || group.ballot(test_grp) == full_mask); + test_assert(s, !test_grp || group.ballot(i < 4 || !test_grp) == 0xf); + if (test_grp) { + test_assert(s, group.ballot(false) == 0); + test_assert(s, group.ballot(true) == full_mask); + test_assert(s, group.ballot(i < 4) == 0xf); + } else { + test_assert(s, group.ballot(true) == full_mask); + test_assert(s, group.ballot(i < 4) == 0xf); + test_assert(s, group.ballot(false) == 0); + } +} + +TEST_F(CooperativeGroups, SubwarpBallot) { test(cg_subwarp_ballot); } + +TEST_F(CooperativeGroups, SubwarpBallot2) { test_subwarp(cg_subwarp_ballot); } + + +} // namespace diff --git a/cuda/test/components/fill_array.cpp b/cuda/test/components/fill_array.cpp new file mode 100644 index 00000000000..f5a1f8734f1 --- /dev/null +++ b/cuda/test/components/fill_array.cpp @@ -0,0 +1,82 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/components/fill_array.hpp" + + +#include +#include +#include + + +#include + + +#include + + +#include "core/test/utils/assertions.hpp" + + +namespace { + + +class FillArray : public ::testing::Test { +protected: + using value_type = double; + FillArray() + : ref(gko::ReferenceExecutor::create()), + exec(gko::CudaExecutor::create(0, ref)), + total_size(6344), + vals(ref, total_size), + dvals(exec, total_size) + { + std::fill_n(vals.get_data(), total_size, 1234.0); + } + + std::shared_ptr ref; + std::shared_ptr exec; + gko::size_type total_size; + gko::Array vals; + gko::Array dvals; +}; + + +TEST_F(FillArray, EqualsReference) +{ + gko::kernels::cuda::components::fill_array(exec, dvals.get_data(), + total_size, 1234.0); + GKO_ASSERT_ARRAY_EQ(vals, dvals); +} + + +} // namespace diff --git a/cuda/test/components/merging_kernels.cu b/cuda/test/components/merging_kernels.cu new file mode 100644 index 00000000000..abd135b4d65 --- /dev/null +++ b/cuda/test/components/merging_kernels.cu @@ -0,0 +1,295 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "cuda/components/merging.cuh" + + +#include +#include +#include +#include + + +#include + + +#include +#include + + +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/test/utils.hpp" + + +namespace { + + +using namespace gko::kernels::cuda; +using namespace cooperative_groups; + + +class Merging : public ::testing::Test { +protected: + Merging() + : ref(gko::ReferenceExecutor::create()), + cuda(gko::CudaExecutor::create(0, ref)), + rng(123456), + rng_runs{100}, + max_size{1637}, + sizes{0, 1, 2, 3, 4, 10, 15, 16, + 31, 34, 102, 242, 534, 956, 1239, 1637}, + data1(ref, max_size), + data2(ref, max_size), + outdata(ref, 2 * max_size), + idxs1(ref), + idxs2(ref), + idxs3(ref), + refidxs1(ref), + refidxs2(ref), + refidxs3(ref), + refdata(ref, 2 * max_size), + ddata1(cuda), + ddata2(cuda), + didxs1(cuda, 2 * max_size), + didxs2(cuda, 2 * max_size), + didxs3(cuda, 2 * max_size), + drefidxs1(cuda, 2 * max_size), + drefidxs2(cuda, 2 * max_size), + drefidxs3(cuda, 2 * max_size), + doutdata(cuda, 2 * max_size) + {} + + void init_data(int rng_run) + { + std::uniform_int_distribution dist(0, max_size); + std::fill_n(data1.get_data(), max_size, 0); + std::fill_n(data2.get_data(), max_size, 0); + for (int i = 0; i < max_size; ++i) { + // here we also want to test some corner cases + // first two runs: zero data1 + if (rng_run > 1) data1.get_data()[i] = dist(rng); + // first and third run: zero data2 + if (rng_run > 2 || rng_run == 1) data2.get_data()[i] = dist(rng); + } + std::sort(data1.get_data(), data1.get_data() + max_size); + std::sort(data2.get_data(), data2.get_data() + max_size); + + ddata1 = data1; + ddata2 = data2; + } + + void assert_eq_ref(int size, int eq_size) + { + outdata = doutdata; + auto out_ptr = outdata.get_const_data(); + auto out_end = out_ptr + eq_size; + auto ref_ptr = refdata.get_data(); + std::copy_n(data1.get_const_data(), size, ref_ptr); + 
std::copy_n(data2.get_const_data(), size, ref_ptr + size); + std::sort(ref_ptr, ref_ptr + 2 * size); + + ASSERT_TRUE(std::equal(out_ptr, out_end, ref_ptr)); + } + + std::shared_ptr ref; + std::shared_ptr cuda; + std::default_random_engine rng; + + int rng_runs; + int max_size; + std::vector sizes; + gko::Array data1; + gko::Array data2; + gko::Array idxs1; + gko::Array idxs2; + gko::Array idxs3; + gko::Array refidxs1; + gko::Array refidxs2; + gko::Array refidxs3; + gko::Array outdata; + gko::Array refdata; + gko::Array ddata1; + gko::Array ddata2; + gko::Array didxs1; + gko::Array didxs2; + gko::Array didxs3; + gko::Array drefidxs1; + gko::Array drefidxs2; + gko::Array drefidxs3; + gko::Array doutdata; +}; + + +__global__ void test_merge_step(const gko::int32 *a, const gko::int32 *b, + gko::int32 *c) +{ + auto warp = tiled_partition(this_thread_block()); + auto i = warp.thread_rank(); + auto result = group_merge_step(a[i], b[i], warp); + c[i] = min(result.a_val, result.b_val); +} + +TEST_F(Merging, MergeStep) +{ + for (int i = 0; i < rng_runs; ++i) { + init_data(i); + test_merge_step<<<1, config::warp_size>>>(ddata1.get_const_data(), + ddata2.get_const_data(), + doutdata.get_data()); + + assert_eq_ref(config::warp_size, config::warp_size); + } +} + + +__global__ void test_merge(const gko::int32 *a, const gko::int32 *b, int size, + gko::int32 *c) +{ + auto warp = tiled_partition(this_thread_block()); + group_merge(a, size, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, + gko::int32 b_val, int i, bool valid) { + if (valid) { + c[i] = min(a_val, b_val); + } + return true; + }); +} + +TEST_F(Merging, FullMerge) +{ + for (int i = 0; i < rng_runs; ++i) { + init_data(i); + for (auto size : sizes) { + test_merge<<<1, config::warp_size>>>(ddata1.get_const_data(), + ddata2.get_const_data(), size, + doutdata.get_data()); + + assert_eq_ref(size, 2 * size); + } + } +} + + +__global__ void test_sequential_merge(const gko::int32 *a, const gko::int32 *b, + int size, 
gko::int32 *c) +{ + sequential_merge( + a, size, b, size, + [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { + c[i] = min(a_val, b_val); + return true; + }); +} + +TEST_F(Merging, SequentialFullMerge) +{ + for (int i = 0; i < rng_runs; ++i) { + init_data(i); + for (auto size : sizes) { + test_sequential_merge<<<1, 1>>>(ddata1.get_const_data(), + ddata2.get_const_data(), size, + doutdata.get_data()); + + assert_eq_ref(size, 2 * size); + } + } +} + + +__global__ void test_merge_idxs(const gko::int32 *a, const gko::int32 *b, + int size, gko::int32 *c, gko::int32 *aidxs, + gko::int32 *bidxs, gko::int32 *cidxs, + gko::int32 *refaidxs, gko::int32 *refbidxs, + gko::int32 *refcidxs) +{ + if (threadIdx.x == 0) { + sequential_merge(a, size, b, size, + [&](int a_idx, gko::int32 a_val, int b_idx, + gko::int32 b_val, int i) { + refaidxs[i] = a_idx; + refbidxs[i] = b_idx; + refcidxs[i] = i; + return true; + }); + } + auto warp = tiled_partition(this_thread_block()); + group_merge(a, size, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, + gko::int32 b_val, int i, bool valid) { + if (valid) { + aidxs[i] = a_idx; + bidxs[i] = b_idx; + cidxs[i] = i; + c[i] = min(a_val, b_val); + } + return true; + }); +} + +TEST_F(Merging, FullMergeIdxs) +{ + for (int i = 0; i < rng_runs; ++i) { + init_data(i); + for (auto size : sizes) { + test_merge_idxs<<<1, config::warp_size>>>( + ddata1.get_const_data(), ddata2.get_const_data(), size, + doutdata.get_data(), didxs1.get_data(), didxs2.get_data(), + didxs3.get_data(), drefidxs1.get_data(), drefidxs2.get_data(), + drefidxs3.get_data()); + + assert_eq_ref(size, 2 * size); + idxs1 = didxs1; + idxs2 = didxs2; + idxs3 = didxs3; + refidxs1 = drefidxs1; + refidxs2 = drefidxs2; + refidxs3 = drefidxs3; + auto idxs1_ptr = idxs1.get_const_data(); + auto idxs2_ptr = idxs2.get_const_data(); + auto idxs3_ptr = idxs3.get_const_data(); + auto refidxs1_ptr = refidxs1.get_const_data(); + auto refidxs2_ptr = 
refidxs2.get_const_data(); + auto refidxs3_ptr = refidxs3.get_const_data(); + + ASSERT_TRUE( + std::equal(idxs1_ptr, idxs1_ptr + 2 * size, refidxs1_ptr)); + ASSERT_TRUE( + std::equal(idxs2_ptr, idxs2_ptr + 2 * size, refidxs2_ptr)); + ASSERT_TRUE( + std::equal(idxs3_ptr, idxs3_ptr + 2 * size, refidxs3_ptr)); + } + } +} + + +} // namespace diff --git a/cuda/test/components/precision_conversion.cpp b/cuda/test/components/precision_conversion.cpp new file mode 100644 index 00000000000..73751dbc1d9 --- /dev/null +++ b/cuda/test/components/precision_conversion.cpp @@ -0,0 +1,173 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include +#include +#include +#include +#include + + +#include + + +#include + + +#include "cuda/test/utils.hpp" + + +namespace { + + +class PrecisionConversion : public ::testing::Test { +protected: + PrecisionConversion() + : ref(gko::ReferenceExecutor::create()), + exec(gko::CudaExecutor::create(0, ref)), + rand(293), + total_size(42793), + vals(ref, total_size), + cvals(ref, total_size), + vals2(ref, 1), + expected_float(ref, 1), + expected_double(ref, 1), + dvals(exec), + dcvals(exec), + dvals2(exec) + { + auto maxval = 1e10f; + std::uniform_real_distribution dist(-maxval, maxval); + for (gko::size_type i = 0; i < total_size; ++i) { + vals.get_data()[i] = dist(rand); + cvals.get_data()[i] = {dist(rand), dist(rand)}; + } + dvals = vals; + dcvals = cvals; + gko::uint64 rawdouble{0x4218888000889111ULL}; + gko::uint32 rawfloat{0x50c44400UL}; + gko::uint64 rawrounded{0x4218888000000000ULL}; + std::memcpy(vals2.get_data(), &rawdouble, sizeof(double)); + std::memcpy(expected_float.get_data(), &rawfloat, sizeof(float)); + std::memcpy(expected_double.get_data(), &rawrounded, sizeof(double)); + dvals2 = vals2; + } + + std::shared_ptr ref; + std::shared_ptr exec; + std::default_random_engine rand; + gko::size_type total_size; + gko::Array vals; + gko::Array dvals; + gko::Array vals2; + gko::Array dvals2; + gko::Array expected_float; + gko::Array expected_double; + gko::Array> cvals; + 
gko::Array> dcvals; +}; + + +TEST_F(PrecisionConversion, ConvertsReal) +{ + gko::Array dtmp; + gko::Array dout; + + dtmp = dvals; + dout = dtmp; + + GKO_ASSERT_ARRAY_EQ(dvals, dout); +} + + +TEST_F(PrecisionConversion, ConvertsRealViaRef) +{ + gko::Array tmp{ref}; + gko::Array dout; + + tmp = dvals; + dout = tmp; + + GKO_ASSERT_ARRAY_EQ(dvals, dout); +} + + +TEST_F(PrecisionConversion, ConvertsComplex) +{ + gko::Array> dtmp; + gko::Array> dout; + + dtmp = dcvals; + dout = dtmp; + + GKO_ASSERT_ARRAY_EQ(dcvals, dout); +} + + +TEST_F(PrecisionConversion, ConversionRounds) +{ + gko::Array dtmp; + gko::Array dout; + + dtmp = dvals2; + dout = dtmp; + + GKO_ASSERT_ARRAY_EQ(dtmp, expected_float); + GKO_ASSERT_ARRAY_EQ(dout, expected_double); +} + + +TEST_F(PrecisionConversion, ConvertsRealFromRef) +{ + gko::Array dtmp; + gko::Array dout; + + dtmp = vals; + dout = dtmp; + + GKO_ASSERT_ARRAY_EQ(dvals, dout); +} + + +TEST_F(PrecisionConversion, ConvertsComplexFromRef) +{ + gko::Array> dtmp; + gko::Array> dout; + + dtmp = cvals; + dout = dtmp; + + GKO_ASSERT_ARRAY_EQ(dcvals, dout); +} + + +} // namespace diff --git a/cuda/test/components/prefix_sum.cpp b/cuda/test/components/prefix_sum.cpp new file mode 100644 index 00000000000..6c3ad82f21e --- /dev/null +++ b/cuda/test/components/prefix_sum.cpp @@ -0,0 +1,96 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/components/prefix_sum.hpp" + + +#include +#include +#include + + +#include + + +#include + + +#include "cuda/test/utils.hpp" + + +namespace { + + +class PrefixSum : public ::testing::Test { +protected: + using index_type = gko::int32; + PrefixSum() + : ref(gko::ReferenceExecutor::create()), + exec(gko::CudaExecutor::create(0, ref)), + rand(293), + total_size(42793), + vals(ref, total_size), + dvals(exec) + { + std::uniform_int_distribution dist(0, 1000); + for (gko::size_type i = 0; i < total_size; ++i) { + vals.get_data()[i] = dist(rand); + } + dvals = vals; + } + + void test(gko::size_type size) + { + gko::kernels::reference::components::prefix_sum(ref, vals.get_data(), + size); + gko::kernels::cuda::components::prefix_sum(exec, dvals.get_data(), + size); + + GKO_ASSERT_ARRAY_EQ(vals, dvals); + } + + std::shared_ptr ref; + std::shared_ptr exec; + std::default_random_engine rand; + gko::size_type total_size; + gko::Array vals; + gko::Array 
dvals; +}; + + +TEST_F(PrefixSum, SmallEqualsReference) { test(100); } + + +TEST_F(PrefixSum, BigEqualsReference) { test(total_size); } + + +} // namespace diff --git a/cuda/test/components/searching_kernels.cu b/cuda/test/components/searching_kernels.cu new file mode 100644 index 00000000000..d4f92099f4a --- /dev/null +++ b/cuda/test/components/searching_kernels.cu @@ -0,0 +1,246 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "cuda/components/searching.cuh" + + +#include +#include +#include + + +#include + + +#include +#include + + +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/test/utils.hpp" + + +namespace { + + +using namespace gko::kernels::cuda; +using cooperative_groups::this_thread_block; + + +class Searching : public ::testing::Test { +protected: + Searching() + : ref(gko::ReferenceExecutor::create()), + cuda(gko::CudaExecutor::create(0, ref)), + result(ref, 1), + dresult(cuda), + sizes(14203) + { + std::iota(sizes.begin(), sizes.end(), 0); + } + + template + void run_test(Kernel kernel, int offset, int size, unsigned num_blocks = 1) + { + *result.get_data() = true; + dresult = result; + kernel<<>>(dresult.get_data(), offset, + size); + result = dresult; + auto success = *result.get_const_data(); + + ASSERT_TRUE(success); + } + + std::shared_ptr ref; + std::shared_ptr cuda; + gko::Array result; + gko::Array dresult; + std::vector sizes; +}; + + +__device__ void test_assert(bool *success, bool predicate) +{ + if (!predicate) { + *success = false; + } +} + + +__global__ void test_binary_search(bool *success, int offset, int size) +{ + // test binary search on [offset, offset + size) + // for all possible partition points + auto result = binary_search(offset, size, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= offset && i < offset + size); + return i >= threadIdx.x + offset; + }); + auto result2 = binary_search(offset, size, [&](int i) { + // don't access out-of-bounds! 
+ test_assert(success, i >= offset && i < offset + size); + return i >= threadIdx.x + offset + 1; + }); + test_assert(success, result == threadIdx.x + offset); + test_assert(success, result2 == threadIdx.x + offset + 1); +} + +TEST_F(Searching, BinaryNoOffset) +{ + run_test(test_binary_search, 0, config::warp_size); +} + +TEST_F(Searching, BinaryOffset) +{ + run_test(test_binary_search, 5, config::warp_size); +} + + +__global__ void test_empty_binary_search(bool *success, int offset, int) +{ + auto result = binary_search(offset, 0, [&](int i) { + // don't access out-of-bounds! + test_assert(success, false); + return false; + }); + test_assert(success, result == offset); +} + +TEST_F(Searching, BinaryEmptyNoOffset) +{ + run_test(test_empty_binary_search, 0, 0); +} + +TEST_F(Searching, BinaryEmptyOffset) +{ + run_test(test_empty_binary_search, 5, 0); +} + + +__global__ void test_sync_binary_search(bool *success, int, int size) +{ + // test binary search on [0, size) + // for all possible partition points + auto result = synchronous_binary_search(size, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= 0 && i < size); + return i >= threadIdx.x; + }); + auto result2 = synchronous_binary_search(size, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= 0 && i < size); + return i >= threadIdx.x + 1; + }); + test_assert(success, result == threadIdx.x); + test_assert(success, result2 == threadIdx.x + 1); +} + +TEST_F(Searching, SyncBinary) +{ + run_test(test_sync_binary_search, 0, config::warp_size); +} + + +__global__ void test_empty_sync_binary_search(bool *success, int, int) +{ + auto result = synchronous_binary_search(0, [&](int i) { + // don't access out-of-bounds! 
+ test_assert(success, false); + return false; + }); + test_assert(success, result == 0); +} + +TEST_F(Searching, EmptySyncBinary) +{ + run_test(test_empty_sync_binary_search, 0, config::warp_size); +} + + +__global__ void test_warp_ary_search(bool *success, int offset, int size) +{ + // test binary search on [offset, offset + size) + // for all possible partition points + auto warp = group::tiled_partition(this_thread_block()); + auto result = group_ary_search(offset, size, warp, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= offset && i < offset + size); + return i >= blockIdx.x + offset; + }); + test_assert(success, result == blockIdx.x + offset); +} + +TEST_F(Searching, WarpAryNoOffset) +{ + for (auto size : sizes) { + run_test(test_warp_ary_search, 0, size, size + 1); + } +} + +TEST_F(Searching, WarpAryOffset) +{ + for (auto size : sizes) { + run_test(test_warp_ary_search, 134, size, size + 1); + } +} + + +__global__ void test_warp_wide_search(bool *success, int offset, int size) +{ + // test binary search on [offset, offset + size) + // for all possible partition points + auto warp = group::tiled_partition(this_thread_block()); + auto result = group_wide_search(offset, size, warp, [&](int i) { + // don't access out-of-bounds! 
+ test_assert(success, i >= offset && i < offset + size); + return i >= blockIdx.x + offset; + }); + test_assert(success, result == blockIdx.x + offset); +} + +TEST_F(Searching, WarpWideNoOffset) +{ + for (auto size : sizes) { + run_test(test_warp_wide_search, 0, size, size + 1); + } +} + +TEST_F(Searching, WarpWideOffset) +{ + for (auto size : sizes) { + run_test(test_warp_wide_search, 142, size, size + 1); + } +} + + +} // namespace diff --git a/cuda/test/components/sorting_kernels.cu b/cuda/test/components/sorting_kernels.cu new file mode 100644 index 00000000000..cc50281177b --- /dev/null +++ b/cuda/test/components/sorting_kernels.cu @@ -0,0 +1,144 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "cuda/components/sorting.cuh" + + +#include +#include + + +#include + + +#include +#include + + +#include "cuda/test/utils.hpp" + + +namespace { + + +using gko::kernels::cuda::bitonic_sort; +using gko::kernels::cuda::config; + + +constexpr auto num_elements = 2048; +constexpr auto num_local = 4; +constexpr auto num_threads = num_elements / num_local; + + +__global__ void test_sort_shared(gko::int32 *data) +{ + gko::int32 local[num_local]; + __shared__ gko::int32 sh_local[num_elements]; + for (int i = 0; i < num_local; ++i) { + local[i] = data[threadIdx.x * num_local + i]; + } + bitonic_sort(local, sh_local); + for (int i = 0; i < num_local; ++i) { + data[threadIdx.x * num_local + i] = local[i]; + } +} + + +__global__ void test_sort_warp(gko::int32 *data) +{ + gko::int32 local[num_local]; + for (int i = 0; i < num_local; ++i) { + local[i] = data[threadIdx.x * num_local + i]; + } + bitonic_sort( + local, static_cast(nullptr)); + for (int i = 0; i < num_local; ++i) { + data[threadIdx.x * num_local + i] = local[i]; + } +} + + +class Sorting : public ::testing::Test { +protected: + Sorting() + : ref(gko::ReferenceExecutor::create()), + cuda(gko::CudaExecutor::create(0, ref)), + rng(123456), + ref_shared(ref, num_elements), + ref_warp(ref), + ddata(cuda) + { + // we want some duplicate elements + std::uniform_int_distribution dist(0, num_elements / 2); + for (auto i = 0; i < 
num_elements; ++i) { + ref_shared.get_data()[i] = dist(rng); + } + ddata = gko::Array{cuda, ref_shared}; + ref_warp = ref_shared; + std::sort(ref_shared.get_data(), ref_shared.get_data() + num_elements); + std::sort(ref_warp.get_data(), + ref_warp.get_data() + (config::warp_size * num_local)); + } + + std::shared_ptr ref; + std::shared_ptr cuda; + std::default_random_engine rng; + gko::Array ref_shared; + gko::Array ref_warp; + gko::Array ddata; +}; + + +TEST_F(Sorting, CudaBitonicSortWarp) +{ + test_sort_warp<<<1, config::warp_size>>>(ddata.get_data()); + ddata.set_executor(ref); + auto data_ptr = ddata.get_const_data(); + auto ref_ptr = ref_warp.get_const_data(); + + ASSERT_TRUE(std::equal(data_ptr, data_ptr + (num_local * config::warp_size), + ref_ptr)); +} + + +TEST_F(Sorting, CudaBitonicSortShared) +{ + test_sort_shared<<<1, num_threads>>>(ddata.get_data()); + ddata.set_executor(ref); + auto data_ptr = ddata.get_const_data(); + auto ref_ptr = ref_shared.get_const_data(); + + ASSERT_TRUE(std::equal(data_ptr, data_ptr + num_elements, ref_ptr)); +} + + +} // namespace diff --git a/cuda/test/factorization/CMakeLists.txt b/cuda/test/factorization/CMakeLists.txt index 36c21b93eea..5b494bf99b9 100644 --- a/cuda/test/factorization/CMakeLists.txt +++ b/cuda/test/factorization/CMakeLists.txt @@ -1 +1,4 @@ +ginkgo_create_test(ilu_kernels) +ginkgo_create_test(par_ict_kernels) ginkgo_create_test(par_ilu_kernels) +ginkgo_create_test(par_ilut_kernels) diff --git a/cuda/test/factorization/ilu_kernels.cpp b/cuda/test/factorization/ilu_kernels.cpp new file mode 100644 index 00000000000..4c1d356b0d0 --- /dev/null +++ b/cuda/test/factorization/ilu_kernels.cpp @@ -0,0 +1,121 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#include +#include +#include + + +#include + + +#include +#include + + +#include "cuda/test/utils.hpp" +#include "matrices/config.hpp" + + +namespace { + + +class Ilu : public ::testing::Test { +protected: + using value_type = gko::default_precision; + using index_type = gko::int32; + using Csr = gko::matrix::Csr; + + std::shared_ptr ref; + std::shared_ptr cuda; + std::shared_ptr csr_ref; + std::shared_ptr csr_cuda; + + Ilu() + : ref(gko::ReferenceExecutor::create()), + cuda(gko::CudaExecutor::create(0, ref)) + {} + + void SetUp() override + { + std::string file_name(gko::matrices::location_ani4_mtx); + auto input_file = std::ifstream(file_name, std::ios::in); + if (!input_file) { + FAIL() << "Could not find the file \"" << file_name + << "\", which is required for this test.\n"; + } + csr_ref = gko::read(input_file, ref); + csr_cuda = Csr::create(cuda); + csr_cuda->copy_from(gko::lend(csr_ref)); + } +}; + + +TEST_F(Ilu, ComputeILUIsEquivalentToRef) +{ + auto ref_fact = + gko::factorization::ParIlu<>::build().on(ref)->generate(csr_ref); + auto cuda_fact = + gko::factorization::Ilu<>::build().on(cuda)->generate(csr_cuda); + + GKO_ASSERT_MTX_NEAR(ref_fact->get_l_factor(), cuda_fact->get_l_factor(), + 1e-14); + GKO_ASSERT_MTX_NEAR(ref_fact->get_u_factor(), cuda_fact->get_u_factor(), + 1e-14); + GKO_ASSERT_MTX_EQ_SPARSITY(ref_fact->get_l_factor(), + cuda_fact->get_l_factor()); + GKO_ASSERT_MTX_EQ_SPARSITY(ref_fact->get_u_factor(), + cuda_fact->get_u_factor()); +} + + +TEST_F(Ilu, SetsCorrectStrategy) +{ + auto hip_fact = + gko::factorization::Ilu<>::build() + .with_l_strategy(std::make_shared()) + .with_u_strategy(std::make_shared(cuda)) + .on(cuda) + ->generate(csr_cuda); + + ASSERT_EQ(hip_fact->get_l_factor()->get_strategy()->get_name(), + "merge_path"); + ASSERT_EQ(hip_fact->get_u_factor()->get_strategy()->get_name(), + "load_balance"); +} + + +} // namespace diff --git 
a/cuda/test/factorization/par_ict_kernels.cpp b/cuda/test/factorization/par_ict_kernels.cpp new file mode 100644 index 00000000000..f052ac4bc85 --- /dev/null +++ b/cuda/test/factorization/par_ict_kernels.cpp @@ -0,0 +1,177 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/factorization/par_ict_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/factorization/factorization_kernels.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "cuda/test/utils.hpp" +#include "matrices/config.hpp" + + +namespace { + + +class ParIct : public ::testing::Test { +protected: + using value_type = gko::default_precision; + using index_type = gko::int32; + using Coo = gko::matrix::Coo; + using Csr = gko::matrix::Csr; + + ParIct() + : mtx_size(436, 436), + rand_engine(45856), + ref(gko::ReferenceExecutor::create()), + cuda(gko::CudaExecutor::create(0, ref)) + { + mtx = gko::test::generate_random_matrix( + mtx_size[0], mtx_size[1], + std::uniform_int_distribution<>(10, mtx_size[1]), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + mtx_l = gko::test::generate_random_lower_triangular_matrix( + mtx_size[0], mtx_size[0], false, + std::uniform_int_distribution<>(1, mtx_size[0]), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + + dmtx_ani = Csr::create(cuda); + dmtx_l_ani = Csr::create(cuda); + dmtx = Csr::create(cuda); + dmtx->copy_from(lend(mtx)); + dmtx_l = Csr::create(cuda); + dmtx_l->copy_from(lend(mtx_l)); + } + + void SetUp() + { + std::string file_name(gko::matrices::location_ani4_mtx); + auto input_file = std::ifstream(file_name, std::ios::in); + if (!input_file) { + FAIL() << "Could not find the file \"" << file_name + << "\", which is required for this test.\n"; + } + mtx_ani = gko::read(input_file, ref); + mtx_ani->sort_by_column_index(); + + { + mtx_l_ani = Csr::create(ref, mtx_ani->get_size()); + gko::matrix::CsrBuilder l_builder( + lend(mtx_l_ani)); + gko::kernels::reference::factorization::initialize_row_ptrs_l( + ref, lend(mtx_ani), mtx_l_ani->get_row_ptrs()); + auto l_nnz = + 
mtx_l_ani->get_const_row_ptrs()[mtx_ani->get_size()[0]]; + l_builder.get_col_idx_array().resize_and_reset(l_nnz); + l_builder.get_value_array().resize_and_reset(l_nnz); + gko::kernels::reference::factorization::initialize_l( + ref, lend(mtx_ani), lend(mtx_l_ani), true); + } + dmtx_ani->copy_from(lend(mtx_ani)); + dmtx_l_ani->copy_from(lend(mtx_l_ani)); + } + + std::shared_ptr ref; + std::shared_ptr cuda; + + const gko::dim<2> mtx_size; + std::default_random_engine rand_engine; + + std::unique_ptr mtx; + std::unique_ptr mtx_ani; + std::unique_ptr mtx_l_ani; + std::unique_ptr mtx_l; + + std::unique_ptr dmtx; + std::unique_ptr dmtx_ani; + std::unique_ptr dmtx_l_ani; + std::unique_ptr dmtx_l; +}; + + +TEST_F(ParIct, KernelAddCandidatesIsEquivalentToRef) +{ + auto mtx_llt = Csr::create(ref, mtx_size); + mtx_l->apply(lend(mtx_l->transpose()), lend(mtx_llt)); + auto dmtx_llt = Csr::create(cuda, mtx_size); + dmtx_llt->copy_from(lend(mtx_llt)); + auto res_mtx_l = Csr::create(ref, mtx_size); + auto dres_mtx_l = Csr::create(cuda, mtx_size); + + gko::kernels::reference::par_ict_factorization::add_candidates( + ref, lend(mtx_llt), lend(mtx), lend(mtx_l), lend(res_mtx_l)); + gko::kernels::cuda::par_ict_factorization::add_candidates( + cuda, lend(dmtx_llt), lend(dmtx), lend(dmtx_l), lend(dres_mtx_l)); + + GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_l, dres_mtx_l); + GKO_ASSERT_MTX_NEAR(res_mtx_l, dres_mtx_l, 1e-14); +} + + +TEST_F(ParIct, KernelComputeFactorIsEquivalentToRef) +{ + auto square_size = mtx_ani->get_size(); + auto mtx_l_coo = Coo::create(ref, square_size); + mtx_l_ani->convert_to(lend(mtx_l_coo)); + auto dmtx_l_coo = Coo::create(cuda, square_size); + dmtx_l_coo->copy_from(lend(mtx_l_coo)); + + gko::kernels::reference::par_ict_factorization::compute_factor( + ref, lend(mtx_ani), lend(mtx_l_ani), lend(mtx_l_coo)); + for (int i = 0; i < 20; ++i) { + gko::kernels::cuda::par_ict_factorization::compute_factor( + cuda, lend(dmtx_ani), lend(dmtx_l_ani), lend(dmtx_l_coo)); + } + + 
GKO_ASSERT_MTX_NEAR(mtx_l_ani, dmtx_l_ani, 1e-2); +} + + +} // namespace diff --git a/cuda/test/factorization/par_ilu_kernels.cpp b/cuda/test/factorization/par_ilu_kernels.cpp index ee28409223c..f3ae4150924 100644 --- a/cuda/test/factorization/par_ilu_kernels.cpp +++ b/cuda/test/factorization/par_ilu_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include @@ -49,7 +50,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include "core/test/utils.hpp" +#include "core/factorization/factorization_kernels.hpp" +#include "cuda/test/utils.hpp" #include "matrices/config.hpp" @@ -64,8 +66,15 @@ class ParIlu : public ::testing::Test { using Coo = gko::matrix::Coo; using Csr = gko::matrix::Csr; + std::ranlux48 rand_engine; + std::shared_ptr ref; + std::shared_ptr cuda; + std::shared_ptr csr_ref; + std::shared_ptr csr_cuda; + ParIlu() - : ref(gko::ReferenceExecutor::create()), + : rand_engine(18), + ref(gko::ReferenceExecutor::create()), cuda(gko::CudaExecutor::create(0, ref)), csr_ref(nullptr), csr_cuda(nullptr) @@ -79,25 +88,61 @@ class ParIlu : public ::testing::Test { FAIL() << "Could not find the file \"" << file_name << "\", which is required for this test.\n"; } - csr_ref = gko::read(input_file, ref); + auto csr_ref_temp = gko::read(input_file, ref); auto csr_cuda_temp = Csr::create(cuda); - csr_cuda_temp->copy_from(gko::lend(csr_ref)); + csr_cuda_temp->copy_from(gko::lend(csr_ref_temp)); + // Make sure there are diagonal elements present + gko::kernels::reference::factorization::add_diagonal_elements( + ref, gko::lend(csr_ref_temp), false); + 
gko::kernels::cuda::factorization::add_diagonal_elements( + cuda, gko::lend(csr_cuda_temp), false); + csr_ref = gko::give(csr_ref_temp); csr_cuda = gko::give(csr_cuda_temp); } - std::shared_ptr ref; - std::shared_ptr cuda; - std::shared_ptr csr_ref; - std::shared_ptr csr_cuda; + template + std::unique_ptr gen_mtx(index_type num_rows, index_type num_cols) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution(0, num_cols - 1), + std::normal_distribution(0.0, 1.0), rand_engine, ref); + } + + std::unique_ptr gen_unsorted_mtx(index_type num_rows, + index_type num_cols) + { + using std::swap; + auto mtx = gen_mtx(num_rows, num_cols); + auto values = mtx->get_values(); + auto col_idxs = mtx->get_col_idxs(); + const auto row_ptrs = mtx->get_const_row_ptrs(); + for (int row = 0; row < num_rows; ++row) { + const auto row_start = row_ptrs[row]; + const auto row_end = row_ptrs[row + 1]; + const int num_row_elements = row_end - row_start; + auto idx_dist = std::uniform_int_distribution( + row_start, row_end - 1); + for (int i = 0; i < num_row_elements / 2; ++i) { + auto idx1 = idx_dist(rand_engine); + auto idx2 = idx_dist(rand_engine); + if (idx1 != idx2) { + swap(values[idx1], values[idx2]); + swap(col_idxs[idx1], col_idxs[idx2]); + } + } + } + return mtx; + } void initialize_row_ptrs(index_type *l_row_ptrs_ref, index_type *u_row_ptrs_ref, index_type *l_row_ptrs_cuda, index_type *u_row_ptrs_cuda) { - gko::kernels::reference::par_ilu_factorization::initialize_row_ptrs_l_u( + gko::kernels::reference::factorization::initialize_row_ptrs_l_u( ref, gko::lend(csr_ref), l_row_ptrs_ref, u_row_ptrs_ref); - gko::kernels::cuda::par_ilu_factorization::initialize_row_ptrs_l_u( + gko::kernels::cuda::factorization::initialize_row_ptrs_l_u( cuda, gko::lend(csr_cuda), l_row_ptrs_cuda, u_row_ptrs_cuda); } @@ -124,18 +169,18 @@ class ParIlu : public ::testing::Test { *l_cuda = Csr::create(cuda, csr_cuda->get_size(), l_nnz); *u_cuda = 
Csr::create(cuda, csr_cuda->get_size(), u_nnz); // Copy the already initialized `row_ptrs` to the new matrices - ref->copy_from(gko::lend(ref), num_row_ptrs, l_row_ptrs_ref.get_data(), - (*l_ref)->get_row_ptrs()); - ref->copy_from(gko::lend(ref), num_row_ptrs, u_row_ptrs_ref.get_data(), - (*u_ref)->get_row_ptrs()); - cuda->copy_from(gko::lend(cuda), num_row_ptrs, - l_row_ptrs_cuda.get_data(), (*l_cuda)->get_row_ptrs()); - cuda->copy_from(gko::lend(cuda), num_row_ptrs, - u_row_ptrs_cuda.get_data(), (*u_cuda)->get_row_ptrs()); - - gko::kernels::reference::par_ilu_factorization::initialize_l_u( + ref->copy(num_row_ptrs, l_row_ptrs_ref.get_data(), + (*l_ref)->get_row_ptrs()); + ref->copy(num_row_ptrs, u_row_ptrs_ref.get_data(), + (*u_ref)->get_row_ptrs()); + cuda->copy(num_row_ptrs, l_row_ptrs_cuda.get_data(), + (*l_cuda)->get_row_ptrs()); + cuda->copy(num_row_ptrs, u_row_ptrs_cuda.get_data(), + (*u_cuda)->get_row_ptrs()); + + gko::kernels::reference::factorization::initialize_l_u( ref, gko::lend(csr_ref), gko::lend(*l_ref), gko::lend(*u_ref)); - gko::kernels::cuda::par_ilu_factorization::initialize_l_u( + gko::kernels::cuda::factorization::initialize_l_u( cuda, gko::lend(csr_cuda), gko::lend(*l_cuda), gko::lend(*u_cuda)); } @@ -176,6 +221,63 @@ class ParIlu : public ::testing::Test { }; +TEST_F(ParIlu, CudaKernelAddDiagonalElementsSortedEquivalentToRef) +{ + index_type num_rows{600}; + index_type num_cols{600}; + auto mtx_ref = gen_mtx(num_rows, num_cols); + auto mtx_cuda = Csr::create(cuda); + mtx_cuda->copy_from(gko::lend(mtx_ref)); + + gko::kernels::reference::factorization::add_diagonal_elements( + ref, gko::lend(mtx_ref), true); + gko::kernels::cuda::factorization::add_diagonal_elements( + cuda, gko::lend(mtx_cuda), true); + + ASSERT_TRUE(mtx_ref->is_sorted_by_column_index()); + GKO_ASSERT_MTX_NEAR(mtx_ref, mtx_cuda, 0.); + GKO_ASSERT_MTX_EQ_SPARSITY(mtx_ref, mtx_cuda); +} + + +TEST_F(ParIlu, CudaKernelAddDiagonalElementsUnsortedEquivalentToRef) +{ + index_type 
num_rows{600}; + index_type num_cols{600}; + auto mtx_ref = gen_unsorted_mtx(num_rows, num_cols); + auto mtx_cuda = Csr::create(cuda); + mtx_cuda->copy_from(gko::lend(mtx_ref)); + + gko::kernels::reference::factorization::add_diagonal_elements( + ref, gko::lend(mtx_ref), false); + gko::kernels::cuda::factorization::add_diagonal_elements( + cuda, gko::lend(mtx_cuda), false); + + ASSERT_FALSE(mtx_ref->is_sorted_by_column_index()); + GKO_ASSERT_MTX_NEAR(mtx_ref, mtx_cuda, 0.); + GKO_ASSERT_MTX_EQ_SPARSITY(mtx_ref, mtx_cuda); +} + + +TEST_F(ParIlu, CudaKernelAddDiagonalElementsNonSquareEquivalentToRef) +{ + index_type num_rows{600}; + index_type num_cols{500}; + auto mtx_ref = gen_mtx(num_rows, num_cols); + auto mtx_cuda = Csr::create(cuda); + mtx_cuda->copy_from(gko::lend(mtx_ref)); + + gko::kernels::reference::factorization::add_diagonal_elements( + ref, gko::lend(mtx_ref), true); + gko::kernels::cuda::factorization::add_diagonal_elements( + cuda, gko::lend(mtx_cuda), true); + + ASSERT_TRUE(mtx_ref->is_sorted_by_column_index()); + GKO_ASSERT_MTX_NEAR(mtx_ref, mtx_cuda, 0.); + GKO_ASSERT_MTX_EQ_SPARSITY(mtx_ref, mtx_cuda); +} + + TEST_F(ParIlu, KernelInitializeRowPtrsLUEquivalentToRef) { auto num_row_ptrs = csr_ref->get_size()[0] + 1; @@ -188,8 +290,8 @@ TEST_F(ParIlu, KernelInitializeRowPtrsLUEquivalentToRef) l_row_ptrs_array_ref.get_data(), u_row_ptrs_array_ref.get_data(), l_row_ptrs_array_cuda.get_data(), u_row_ptrs_array_cuda.get_data()); - GKO_ASSERT_ARRAY_EQ(&l_row_ptrs_array_ref, &l_row_ptrs_array_cuda); - GKO_ASSERT_ARRAY_EQ(&u_row_ptrs_array_ref, &u_row_ptrs_array_cuda); + GKO_ASSERT_ARRAY_EQ(l_row_ptrs_array_ref, l_row_ptrs_array_cuda); + GKO_ASSERT_ARRAY_EQ(u_row_ptrs_array_ref, u_row_ptrs_array_cuda); } @@ -204,6 +306,8 @@ TEST_F(ParIlu, KernelInitializeParILUIsEquivalentToRef) GKO_ASSERT_MTX_NEAR(l_ref, l_cuda, 1e-14); GKO_ASSERT_MTX_NEAR(u_ref, u_cuda, 1e-14); + GKO_ASSERT_MTX_EQ_SPARSITY(l_ref, l_cuda); + GKO_ASSERT_MTX_EQ_SPARSITY(u_ref, u_cuda); } @@ 
-218,6 +322,8 @@ TEST_F(ParIlu, KernelComputeParILUIsEquivalentToRef) GKO_ASSERT_MTX_NEAR(l_ref, l_cuda, 5e-2); GKO_ASSERT_MTX_NEAR(u_ref, u_cuda, 5e-2); + GKO_ASSERT_MTX_EQ_SPARSITY(l_ref, l_cuda); + GKO_ASSERT_MTX_EQ_SPARSITY(u_ref, u_cuda); } @@ -233,6 +339,8 @@ TEST_F(ParIlu, KernelComputeParILUWithMoreIterationsIsEquivalentToRef) GKO_ASSERT_MTX_NEAR(l_ref, l_cuda, 1e-14); GKO_ASSERT_MTX_NEAR(u_ref, u_cuda, 1e-14); + GKO_ASSERT_MTX_EQ_SPARSITY(l_ref, l_cuda); + GKO_ASSERT_MTX_EQ_SPARSITY(u_ref, u_cuda); } diff --git a/cuda/test/factorization/par_ilut_kernels.cpp b/cuda/test/factorization/par_ilut_kernels.cpp new file mode 100644 index 00000000000..7a66ffe2ec0 --- /dev/null +++ b/cuda/test/factorization/par_ilut_kernels.cpp @@ -0,0 +1,541 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/factorization/factorization_kernels.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "cuda/test/utils.hpp" +#include "matrices/config.hpp" + + +namespace { + + +class ParIlut : public ::testing::Test { +protected: + using value_type = gko::default_precision; + using index_type = gko::int32; + using Dense = gko::matrix::Dense; + using ComplexDense = gko::matrix::Dense>; + using Coo = gko::matrix::Coo; + using Csr = gko::matrix::Csr; + using ComplexCsr = gko::matrix::Csr, index_type>; + + ParIlut() + : mtx_size(532, 423), + rand_engine(1337), + ref(gko::ReferenceExecutor::create()), + cuda(gko::CudaExecutor::create(0, ref)) + { + mtx1 = gko::test::generate_random_matrix( + mtx_size[0], mtx_size[1], + std::uniform_int_distribution<>(10, mtx_size[1]), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + mtx2 = gko::test::generate_random_matrix( + mtx_size[0], mtx_size[1], + std::uniform_int_distribution<>(0, mtx_size[1]), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + mtx_square = gko::test::generate_random_matrix( + mtx_size[0], mtx_size[0], + std::uniform_int_distribution<>(1, mtx_size[0]), + std::normal_distribution<>(-1.0, 1.0), 
rand_engine, ref); + mtx_l = gko::test::generate_random_lower_triangular_matrix( + mtx_size[0], mtx_size[0], false, + std::uniform_int_distribution<>(10, mtx_size[0]), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + mtx_l2 = gko::test::generate_random_lower_triangular_matrix( + mtx_size[0], mtx_size[0], true, + std::uniform_int_distribution<>(1, mtx_size[0]), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + mtx_l_complex = + gko::test::generate_random_lower_triangular_matrix( + mtx_size[0], mtx_size[0], false, + std::uniform_int_distribution<>(10, mtx_size[0]), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + mtx_u = gko::test::generate_random_upper_triangular_matrix( + mtx_size[0], mtx_size[0], false, + std::uniform_int_distribution<>(10, mtx_size[0]), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + mtx_u_complex = + gko::test::generate_random_upper_triangular_matrix( + mtx_size[0], mtx_size[0], false, + std::uniform_int_distribution<>(10, mtx_size[0]), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + + dmtx1 = Csr::create(cuda); + dmtx1->copy_from(mtx1.get()); + dmtx2 = Csr::create(cuda); + dmtx2->copy_from(mtx2.get()); + dmtx_square = Csr::create(cuda); + dmtx_square->copy_from(mtx_square.get()); + dmtx_ani = Csr::create(cuda); + dmtx_l_ani = Csr::create(cuda); + dmtx_u_ani = Csr::create(cuda); + dmtx_ut_ani = Csr::create(cuda); + dmtx_l = Csr::create(cuda); + dmtx_l->copy_from(mtx_l.get()); + dmtx_l2 = Csr::create(cuda); + dmtx_l2->copy_from(mtx_l2.get()); + dmtx_u = Csr::create(cuda); + dmtx_u->copy_from(mtx_u.get()); + dmtx_l_complex = ComplexCsr::create(cuda); + dmtx_l_complex->copy_from(mtx_l_complex.get()); + dmtx_u_complex = ComplexCsr::create(cuda); + dmtx_u_complex->copy_from(mtx_u_complex.get()); + } + + void SetUp() + { + std::string file_name(gko::matrices::location_ani4_mtx); + auto input_file = std::ifstream(file_name, std::ios::in); + if (!input_file) { + FAIL() << "Could not find 
the file \"" << file_name + << "\", which is required for this test.\n"; + } + mtx_ani = gko::read(input_file, ref); + mtx_ani->sort_by_column_index(); + + { + mtx_l_ani = Csr::create(ref, mtx_ani->get_size()); + mtx_u_ani = Csr::create(ref, mtx_ani->get_size()); + gko::matrix::CsrBuilder l_builder( + mtx_l_ani.get()); + gko::matrix::CsrBuilder u_builder( + mtx_u_ani.get()); + gko::kernels::reference::factorization::initialize_row_ptrs_l_u( + ref, mtx_ani.get(), mtx_l_ani->get_row_ptrs(), + mtx_u_ani->get_row_ptrs()); + auto l_nnz = + mtx_l_ani->get_const_row_ptrs()[mtx_ani->get_size()[0]]; + auto u_nnz = + mtx_u_ani->get_const_row_ptrs()[mtx_ani->get_size()[0]]; + l_builder.get_col_idx_array().resize_and_reset(l_nnz); + l_builder.get_value_array().resize_and_reset(l_nnz); + u_builder.get_col_idx_array().resize_and_reset(u_nnz); + u_builder.get_value_array().resize_and_reset(u_nnz); + gko::kernels::reference::factorization::initialize_l_u( + ref, mtx_ani.get(), mtx_l_ani.get(), mtx_u_ani.get()); + mtx_ut_ani = Csr::create(ref, mtx_ani->get_size(), + mtx_u_ani->get_num_stored_elements()); + gko::kernels::reference::csr::transpose(ref, mtx_u_ani.get(), + mtx_ut_ani.get()); + } + dmtx_ani->copy_from(mtx_ani.get()); + dmtx_l_ani->copy_from(mtx_l_ani.get()); + dmtx_u_ani->copy_from(mtx_u_ani.get()); + dmtx_ut_ani->copy_from(mtx_ut_ani.get()); + } + + template + void test_select(const std::unique_ptr &mtx, + const std::unique_ptr &dmtx, index_type rank, + value_type tolerance = 0.0) + { + auto size = index_type(mtx->get_num_stored_elements()); + using ValueType = typename Mtx::value_type; + + gko::remove_complex res{}; + gko::remove_complex dres{}; + gko::Array tmp(ref); + gko::Array> tmp2(ref); + gko::Array dtmp(cuda); + gko::Array> dtmp2(cuda); + + gko::kernels::reference::par_ilut_factorization::threshold_select( + ref, mtx.get(), rank, tmp, tmp2, res); + gko::kernels::cuda::par_ilut_factorization::threshold_select( + cuda, dmtx.get(), rank, dtmp, dtmp2, dres); + + 
ASSERT_NEAR(res, dres, tolerance); + } + + template > + void test_filter(const std::unique_ptr &mtx, + const std::unique_ptr &dmtx, value_type threshold, + bool lower) + { + auto res = Mtx::create(ref, mtx_size); + auto dres = Mtx::create(cuda, mtx_size); + auto res_coo = Coo::create(ref, mtx_size); + auto dres_coo = Coo::create(cuda, mtx_size); + auto local_mtx = gko::as(lower ? mtx->clone() : mtx->transpose()); + auto local_dmtx = + gko::as(lower ? dmtx->clone() : dmtx->transpose()); + + gko::kernels::reference::par_ilut_factorization::threshold_filter( + ref, local_mtx.get(), threshold, res.get(), res_coo.get(), lower); + gko::kernels::cuda::par_ilut_factorization::threshold_filter( + cuda, local_dmtx.get(), threshold, dres.get(), dres_coo.get(), + lower); + + GKO_ASSERT_MTX_NEAR(res, dres, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(res, dres); + GKO_ASSERT_MTX_NEAR(res, res_coo, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(res, res_coo); + GKO_ASSERT_MTX_NEAR(dres, dres_coo, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(dres, dres_coo); + } + + template > + void test_filter_approx(const std::unique_ptr &mtx, + const std::unique_ptr &dmtx, index_type rank, + value_type tolerance = 0.0) + { + auto res = Mtx::create(ref, mtx_size); + auto dres = Mtx::create(cuda, mtx_size); + auto res_coo = Coo::create(ref, mtx_size); + auto dres_coo = Coo::create(cuda, mtx_size); + using ValueType = typename Mtx::value_type; + + gko::Array tmp(ref); + gko::Array dtmp(cuda); + gko::remove_complex threshold{}; + gko::remove_complex dthreshold{}; + + gko::kernels::reference::par_ilut_factorization:: + threshold_filter_approx(ref, mtx.get(), rank, tmp, threshold, + res.get(), res_coo.get()); + gko::kernels::cuda::par_ilut_factorization::threshold_filter_approx( + cuda, dmtx.get(), rank, dtmp, dthreshold, dres.get(), + dres_coo.get()); + + GKO_ASSERT_MTX_NEAR(res, dres, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(res, dres); + GKO_ASSERT_MTX_NEAR(res, res_coo, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(res, res_coo); + 
GKO_ASSERT_MTX_NEAR(dres, dres_coo, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(dres, dres_coo); + ASSERT_NEAR(threshold, dthreshold, tolerance); + } + + std::shared_ptr ref; + std::shared_ptr cuda; + + const gko::dim<2> mtx_size; + std::default_random_engine rand_engine; + + std::unique_ptr mtx1; + std::unique_ptr mtx2; + std::unique_ptr mtx_square; + std::unique_ptr mtx_ani; + std::unique_ptr mtx_l_ani; + std::unique_ptr mtx_u_ani; + std::unique_ptr mtx_ut_ani; + std::unique_ptr mtx_l; + std::unique_ptr mtx_l2; + std::unique_ptr mtx_l_complex; + std::unique_ptr mtx_u; + std::unique_ptr mtx_u_complex; + + std::unique_ptr dmtx1; + std::unique_ptr dmtx2; + std::unique_ptr dmtx_square; + std::unique_ptr dmtx_ani; + std::unique_ptr dmtx_l_ani; + std::unique_ptr dmtx_u_ani; + std::unique_ptr dmtx_ut_ani; + std::unique_ptr dmtx_l; + std::unique_ptr dmtx_l2; + std::unique_ptr dmtx_l_complex; + std::unique_ptr dmtx_u; + std::unique_ptr dmtx_u_complex; +}; + + +TEST_F(ParIlut, KernelThresholdSelectIsEquivalentToRef) +{ + test_select(mtx_l, dmtx_l, mtx_l->get_num_stored_elements() / 3); +} + + +TEST_F(ParIlut, KernelThresholdSelectMinIsEquivalentToRef) +{ + test_select(mtx_l, dmtx_l, 0); +} + + +TEST_F(ParIlut, KernelThresholdSelectMaxIsEquivalentToRef) +{ + test_select(mtx_l, dmtx_l, mtx_l->get_num_stored_elements() - 1); +} + + +TEST_F(ParIlut, KernelComplexThresholdSelectIsEquivalentToRef) +{ + test_select(mtx_l_complex, dmtx_l_complex, + mtx_l_complex->get_num_stored_elements() / 3, 1e-14); +} + + +TEST_F(ParIlut, KernelComplexThresholdSelectMinIsEquivalentToRef) +{ + test_select(mtx_l_complex, dmtx_l_complex, 0, 1e-14); +} + + +TEST_F(ParIlut, KernelComplexThresholdSelectMaxLowerIsEquivalentToRef) +{ + test_select(mtx_l_complex, dmtx_l_complex, + mtx_l_complex->get_num_stored_elements() - 1, 1e-14); +} + + +TEST_F(ParIlut, KernelThresholdFilterNullptrCooIsEquivalentToRef) +{ + auto res = Csr::create(ref, mtx_size); + auto dres = Csr::create(cuda, mtx_size); + Coo *null_coo = 
nullptr; + + gko::kernels::reference::par_ilut_factorization::threshold_filter( + ref, mtx_l.get(), 0.5, res.get(), null_coo, true); + gko::kernels::cuda::par_ilut_factorization::threshold_filter( + cuda, dmtx_l.get(), 0.5, dres.get(), null_coo, true); + + GKO_ASSERT_MTX_NEAR(res, dres, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(res, dres); +} + + +TEST_F(ParIlut, KernelThresholdFilterLowerIsEquivalentToRef) +{ + test_filter(mtx_l, dmtx_l, 0.5, true); +} + + +TEST_F(ParIlut, KernelThresholdFilterUpperIsEquivalentToRef) +{ + test_filter(mtx_l, dmtx_l, 0.5, false); +} + + +TEST_F(ParIlut, KernelThresholdFilterNoneLowerIsEquivalentToRef) +{ + test_filter(mtx_l, dmtx_l, 0, true); +} + + +TEST_F(ParIlut, KernelThresholdFilterNoneUpperIsEquivalentToRef) +{ + test_filter(mtx_l, dmtx_l, 0, false); +} + + +TEST_F(ParIlut, KernelThresholdFilterAllLowerIsEquivalentToRef) +{ + test_filter(mtx_l, dmtx_l, 1e6, true); +} + + +TEST_F(ParIlut, KernelThresholdFilterAllUpperIsEquivalentToRef) +{ + test_filter(mtx_l, dmtx_l, 1e6, false); +} + + +TEST_F(ParIlut, KernelComplexThresholdFilterLowerIsEquivalentToRef) +{ + test_filter(mtx_l_complex, dmtx_l_complex, 0.5, true); +} + + +TEST_F(ParIlut, KernelComplexThresholdFilterUpperIsEquivalentToRef) +{ + test_filter(mtx_l_complex, dmtx_l_complex, 0.5, false); +} + + +TEST_F(ParIlut, KernelComplexThresholdFilterNoneLowerIsEquivalentToRef) +{ + test_filter(mtx_l_complex, dmtx_l_complex, 0, true); +} + + +TEST_F(ParIlut, KernelComplexThresholdFilterNoneUpperIsEquivalentToRef) +{ + test_filter(mtx_l_complex, dmtx_l_complex, 0, false); +} + + +TEST_F(ParIlut, KernelComplexThresholdFilterAllLowerIsEquivalentToRef) +{ + test_filter(mtx_l_complex, dmtx_l_complex, 1e6, true); +} + + +TEST_F(ParIlut, KernelComplexThresholdFilterAllUppererIsEquivalentToRef) +{ + test_filter(mtx_l_complex, dmtx_l_complex, 1e6, false); +} + + +TEST_F(ParIlut, KernelThresholdFilterApproxNullptrCooIsEquivalentToRef) +{ + test_filter(mtx_l, dmtx_l, 0.5, true); + auto res = 
Csr::create(ref, mtx_size); + auto dres = Csr::create(cuda, mtx_size); + Coo *null_coo = nullptr; + gko::Array tmp(ref); + gko::Array dtmp(cuda); + gko::remove_complex threshold{}; + gko::remove_complex dthreshold{}; + index_type rank{}; + + gko::kernels::reference::par_ilut_factorization::threshold_filter_approx( + ref, mtx_l.get(), rank, tmp, threshold, res.get(), null_coo); + gko::kernels::cuda::par_ilut_factorization::threshold_filter_approx( + cuda, dmtx_l.get(), rank, dtmp, dthreshold, dres.get(), null_coo); + + GKO_ASSERT_MTX_NEAR(res, dres, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(res, dres); + ASSERT_EQ(threshold, dthreshold); +} + + +TEST_F(ParIlut, KernelThresholdFilterApproxLowerIsEquivalentToRef) +{ + test_filter_approx(mtx_l, dmtx_l, mtx_l->get_num_stored_elements() / 2); +} + + +TEST_F(ParIlut, KernelThresholdFilterApproxNoneLowerIsEquivalentToRef) +{ + test_filter_approx(mtx_l, dmtx_l, 0); +} + + +TEST_F(ParIlut, KernelThresholdFilterApproxAllLowerIsEquivalentToRef) +{ + test_filter_approx(mtx_l, dmtx_l, mtx_l->get_num_stored_elements() - 1); +} + + +TEST_F(ParIlut, KernelComplexThresholdFilterApproxLowerIsEquivalentToRef) +{ + test_filter_approx(mtx_l_complex, dmtx_l_complex, + mtx_l_complex->get_num_stored_elements() / 2, + r::value); +} + + +TEST_F(ParIlut, KernelComplexThresholdFilterApproxNoneLowerIsEquivalentToRef) +{ + test_filter_approx(mtx_l_complex, dmtx_l_complex, 0, r::value); +} + + +TEST_F(ParIlut, KernelComplexThresholdFilterApproxAllLowerIsEquivalentToRef) +{ + test_filter_approx(mtx_l_complex, dmtx_l_complex, + mtx_l_complex->get_num_stored_elements() - 1, + r::value); +} + + +TEST_F(ParIlut, KernelAddCandidatesIsEquivalentToRef) +{ + auto square_size = mtx_square->get_size(); + auto mtx_lu = Csr::create(ref, square_size); + mtx_l2->apply(mtx_u.get(), mtx_lu.get()); + auto dmtx_lu = Csr::create(cuda, square_size); + dmtx_lu->copy_from(mtx_lu.get()); + auto res_mtx_l = Csr::create(ref, square_size); + auto res_mtx_u = Csr::create(ref, 
square_size); + auto dres_mtx_l = Csr::create(cuda, square_size); + auto dres_mtx_u = Csr::create(cuda, square_size); + + gko::kernels::reference::par_ilut_factorization::add_candidates( + ref, mtx_lu.get(), mtx_square.get(), mtx_l2.get(), mtx_u.get(), + res_mtx_l.get(), res_mtx_u.get()); + gko::kernels::cuda::par_ilut_factorization::add_candidates( + cuda, dmtx_lu.get(), dmtx_square.get(), dmtx_l2.get(), dmtx_u.get(), + dres_mtx_l.get(), dres_mtx_u.get()); + + GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_l, dres_mtx_l); + GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_u, dres_mtx_u); + GKO_ASSERT_MTX_NEAR(res_mtx_l, dres_mtx_l, 1e-14); + GKO_ASSERT_MTX_NEAR(res_mtx_u, dres_mtx_u, 1e-14); +} + + +TEST_F(ParIlut, KernelComputeLUIsEquivalentToRef) +{ + auto square_size = mtx_ani->get_size(); + auto mtx_l_coo = Coo::create(ref, square_size); + auto mtx_u_coo = Coo::create(ref, square_size); + mtx_l_ani->convert_to(mtx_l_coo.get()); + mtx_u_ani->convert_to(mtx_u_coo.get()); + auto dmtx_l_coo = Coo::create(cuda, square_size); + auto dmtx_u_coo = Coo::create(cuda, square_size); + dmtx_l_coo->copy_from(mtx_l_coo.get()); + dmtx_u_coo->copy_from(mtx_u_coo.get()); + + gko::kernels::reference::par_ilut_factorization::compute_l_u_factors( + ref, mtx_ani.get(), mtx_l_ani.get(), mtx_l_coo.get(), mtx_u_ani.get(), + mtx_u_coo.get(), mtx_ut_ani.get()); + for (int i = 0; i < 20; ++i) { + gko::kernels::cuda::par_ilut_factorization::compute_l_u_factors( + cuda, dmtx_ani.get(), dmtx_l_ani.get(), dmtx_l_coo.get(), + dmtx_u_ani.get(), dmtx_u_coo.get(), dmtx_ut_ani.get()); + } + auto dmtx_utt_ani = gko::as(dmtx_ut_ani->transpose()); + + GKO_ASSERT_MTX_NEAR(mtx_l_ani, dmtx_l_ani, 1e-2); + GKO_ASSERT_MTX_NEAR(mtx_u_ani, dmtx_u_ani, 1e-2); + GKO_ASSERT_MTX_NEAR(dmtx_u_ani, dmtx_utt_ani, 0); +} + + +} // namespace diff --git a/cuda/test/matrix/coo_kernels.cpp b/cuda/test/matrix/coo_kernels.cpp index d1974a64530..7d6051d9f63 100644 --- a/cuda/test/matrix/coo_kernels.cpp +++ b/cuda/test/matrix/coo_kernels.cpp @@ 
-1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/coo_kernels.hpp" +#include #include @@ -42,12 +42,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include #include #include -#include "core/test/utils.hpp" +#include "core/matrix/coo_kernels.hpp" +#include "cuda/test/utils.hpp" namespace { diff --git a/cuda/test/matrix/csr_kernels.cpp b/cuda/test/matrix/csr_kernels.cpp index fd08a070cdb..39608505cbd 100644 --- a/cuda/test/matrix/csr_kernels.cpp +++ b/cuda/test/matrix/csr_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/csr_kernels.hpp" +#include #include @@ -42,15 +42,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include -#include #include #include #include +#include #include #include -#include "core/test/utils.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "cuda/test/utils.hpp" namespace { @@ -63,7 +64,7 @@ class Csr : public ::testing::Test { using ComplexVec = gko::matrix::Dense>; using ComplexMtx = gko::matrix::Csr>; - Csr() : rand_engine(42) {} + Csr() : mtx_size(532, 231), rand_engine(42) {} void SetUp() { @@ -93,13 +94,17 @@ class Csr : public ::testing::Test { int num_vectors = 1) { mtx = Mtx::create(ref, strategy); - mtx->copy_from(gen_mtx(532, 231, 1)); - expected = gen_mtx(532, num_vectors, 1); - y = gen_mtx(231, num_vectors, 1); + mtx->copy_from(gen_mtx(mtx_size[0], mtx_size[1], 1)); + square_mtx = Mtx::create(ref, strategy); + square_mtx->copy_from(gen_mtx(mtx_size[0], mtx_size[0], 1)); + expected = gen_mtx(mtx_size[0], num_vectors, 1); + y = gen_mtx(mtx_size[1], num_vectors, 1); alpha = gko::initialize({2.0}, ref); beta = gko::initialize({-1.0}, ref); dmtx = Mtx::create(cuda, strategy); dmtx->copy_from(mtx.get()); + square_dmtx = Mtx::create(cuda, strategy); + square_dmtx->copy_from(square_mtx.get()); dresult = Vec::create(cuda); dresult->copy_from(expected.get()); dy = Vec::create(cuda); @@ -114,18 +119,53 @@ class Csr : public ::testing::Test { std::shared_ptr strategy) { complex_mtx = ComplexMtx::create(ref, strategy); - complex_mtx->copy_from(gen_mtx(532, 231, 1)); + complex_mtx->copy_from( + gen_mtx(mtx_size[0], mtx_size[1], 1)); complex_dmtx = ComplexMtx::create(cuda, strategy); complex_dmtx->copy_from(complex_mtx.get()); } + struct matrix_pair { + std::unique_ptr ref; + std::unique_ptr cuda; + }; + + matrix_pair gen_unsorted_mtx() + { + constexpr int min_nnz_per_row = 2; // Must be at least 2 + auto local_mtx_ref = + gen_mtx(mtx_size[0], mtx_size[1], min_nnz_per_row); + for (size_t row = 0; row < mtx_size[0]; ++row) { + const auto row_ptrs = local_mtx_ref->get_const_row_ptrs(); + const auto start_row = row_ptrs[row]; + auto 
col_idx = local_mtx_ref->get_col_idxs() + start_row; + auto vals = local_mtx_ref->get_values() + start_row; + const auto nnz_in_this_row = row_ptrs[row + 1] - row_ptrs[row]; + auto swap_idx_dist = + std::uniform_int_distribution<>(0, nnz_in_this_row - 1); + // shuffle `nnz_in_this_row / 2` times + for (size_t perm = 0; perm < nnz_in_this_row; perm += 2) { + const auto idx1 = swap_idx_dist(rand_engine); + const auto idx2 = swap_idx_dist(rand_engine); + std::swap(col_idx[idx1], col_idx[idx2]); + std::swap(vals[idx1], vals[idx2]); + } + } + auto local_mtx_cuda = Mtx::create(cuda); + local_mtx_cuda->copy_from(local_mtx_ref.get()); + + return {std::move(local_mtx_ref), std::move(local_mtx_cuda)}; + } + std::shared_ptr ref; std::shared_ptr cuda; + const gko::dim<2> mtx_size; std::ranlux48 rand_engine; std::unique_ptr mtx; std::unique_ptr complex_mtx; + std::unique_ptr square_mtx; std::unique_ptr expected; std::unique_ptr y; std::unique_ptr alpha; @@ -133,6 +173,7 @@ class Csr : public ::testing::Test { std::unique_ptr dmtx; std::unique_ptr complex_dmtx; + std::unique_ptr square_dmtx; std::unique_ptr dresult; std::unique_ptr dy; std::unique_ptr dalpha; @@ -142,7 +183,7 @@ class Csr : public ::testing::Test { TEST_F(Csr, StrategyAfterCopyIsEquivalentToRef) { - set_up_apply_data(std::make_shared(32)); + set_up_apply_data(std::make_shared(cuda)); ASSERT_EQ(mtx->get_strategy()->get_name(), dmtx->get_strategy()->get_name()); @@ -151,7 +192,7 @@ TEST_F(Csr, StrategyAfterCopyIsEquivalentToRef) TEST_F(Csr, SimpleApplyIsEquivalentToRefWithLoadBalance) { - set_up_apply_data(std::make_shared(32)); + set_up_apply_data(std::make_shared(cuda)); mtx->apply(y.get(), expected.get()); dmtx->apply(dy.get(), dresult.get()); @@ -162,7 +203,7 @@ TEST_F(Csr, SimpleApplyIsEquivalentToRefWithLoadBalance) TEST_F(Csr, AdvancedApplyIsEquivalentToRefWithLoadBalance) { - set_up_apply_data(std::make_shared(32)); + set_up_apply_data(std::make_shared(cuda)); mtx->apply(alpha.get(), y.get(), beta.get(), 
expected.get()); dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); @@ -173,7 +214,7 @@ TEST_F(Csr, AdvancedApplyIsEquivalentToRefWithLoadBalance) TEST_F(Csr, SimpleApplyIsEquivalentToRefWithCusparse) { - set_up_apply_data(std::make_shared()); + set_up_apply_data(std::make_shared()); mtx->apply(y.get(), expected.get()); dmtx->apply(dy.get(), dresult.get()); @@ -184,7 +225,7 @@ TEST_F(Csr, SimpleApplyIsEquivalentToRefWithCusparse) TEST_F(Csr, AdvancedApplyIsEquivalentToRefWithCusparse) { - set_up_apply_data(std::make_shared()); + set_up_apply_data(std::make_shared()); mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); @@ -239,7 +280,7 @@ TEST_F(Csr, AdvancedApplyIsEquivalentToRefWithClassical) TEST_F(Csr, SimpleApplyIsEquivalentToRefWithAutomatical) { - set_up_apply_data(std::make_shared(32)); + set_up_apply_data(std::make_shared(cuda)); mtx->apply(y.get(), expected.get()); dmtx->apply(dy.get(), dresult.get()); @@ -250,7 +291,7 @@ TEST_F(Csr, SimpleApplyIsEquivalentToRefWithAutomatical) TEST_F(Csr, SimpleApplyToDenseMatrixIsEquivalentToRefWithLoadBalance) { - set_up_apply_data(std::make_shared(32), 3); + set_up_apply_data(std::make_shared(cuda), 3); mtx->apply(y.get(), expected.get()); dmtx->apply(dy.get(), dresult.get()); @@ -261,7 +302,7 @@ TEST_F(Csr, SimpleApplyToDenseMatrixIsEquivalentToRefWithLoadBalance) TEST_F(Csr, AdvancedApplyToDenseMatrixIsEquivalentToRefWithLoadBalance) { - set_up_apply_data(std::make_shared(32), 3); + set_up_apply_data(std::make_shared(cuda), 3); mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); @@ -314,9 +355,61 @@ TEST_F(Csr, AdvancedApplyToDenseMatrixIsEquivalentToRefWithMergePath) } +TEST_F(Csr, AdvancedApplyToCsrMatrixIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared()); + auto trans = mtx->transpose(); + auto d_trans = dmtx->transpose(); + + 
mtx->apply(alpha.get(), trans.get(), beta.get(), square_mtx.get()); + dmtx->apply(dalpha.get(), d_trans.get(), dbeta.get(), square_dmtx.get()); + + GKO_ASSERT_MTX_NEAR(square_dmtx, square_mtx, 1e-14); + GKO_ASSERT_MTX_EQ_SPARSITY(square_dmtx, square_mtx); + ASSERT_TRUE(square_dmtx->is_sorted_by_column_index()); +} + + +TEST_F(Csr, SimpleApplyToCsrMatrixIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared()); + auto trans = mtx->transpose(); + auto d_trans = dmtx->transpose(); + + mtx->apply(trans.get(), square_mtx.get()); + dmtx->apply(d_trans.get(), square_dmtx.get()); + + GKO_ASSERT_MTX_NEAR(square_dmtx, square_mtx, 1e-14); + GKO_ASSERT_MTX_EQ_SPARSITY(square_dmtx, square_mtx); + ASSERT_TRUE(square_dmtx->is_sorted_by_column_index()); +} + + +TEST_F(Csr, AdvancedApplyToIdentityMatrixIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared()); + auto a = gen_mtx(mtx_size[0], mtx_size[1], 0); + auto b = gen_mtx(mtx_size[0], mtx_size[1], 0); + auto da = Mtx::create(cuda); + auto db = Mtx::create(cuda); + da->copy_from(a.get()); + db->copy_from(b.get()); + auto id = gko::matrix::Identity::create(ref, mtx_size[1]); + auto did = + gko::matrix::Identity::create(cuda, mtx_size[1]); + + a->apply(alpha.get(), id.get(), beta.get(), b.get()); + da->apply(dalpha.get(), did.get(), dbeta.get(), db.get()); + + GKO_ASSERT_MTX_NEAR(b, db, 1e-14); + GKO_ASSERT_MTX_EQ_SPARSITY(b, db); + ASSERT_TRUE(db->is_sorted_by_column_index()); +} + + TEST_F(Csr, TransposeIsEquivalentToRef) { - set_up_apply_data(std::make_shared(32)); + set_up_apply_data(std::make_shared(cuda)); auto trans = mtx->transpose(); auto d_trans = dmtx->transpose(); @@ -328,7 +421,7 @@ TEST_F(Csr, TransposeIsEquivalentToRef) TEST_F(Csr, ConjugateTransposeIsEquivalentToRef) { - set_up_apply_complex_data(std::make_shared(32)); + set_up_apply_complex_data(std::make_shared(cuda)); auto trans = complex_mtx->conj_transpose(); auto d_trans = complex_dmtx->conj_transpose(); @@ -340,7 +433,7 @@ TEST_F(Csr, 
ConjugateTransposeIsEquivalentToRef) TEST_F(Csr, ConvertToDenseIsEquivalentToRef) { - set_up_apply_data(std::make_shared()); + set_up_apply_data(std::make_shared()); auto dense_mtx = gko::matrix::Dense<>::create(ref); auto ddense_mtx = gko::matrix::Dense<>::create(cuda); @@ -353,7 +446,7 @@ TEST_F(Csr, ConvertToDenseIsEquivalentToRef) TEST_F(Csr, MoveToDenseIsEquivalentToRef) { - set_up_apply_data(std::make_shared()); + set_up_apply_data(std::make_shared()); auto dense_mtx = gko::matrix::Dense<>::create(ref); auto ddense_mtx = gko::matrix::Dense<>::create(cuda); @@ -366,7 +459,7 @@ TEST_F(Csr, MoveToDenseIsEquivalentToRef) TEST_F(Csr, ConvertToEllIsEquivalentToRef) { - set_up_apply_data(std::make_shared()); + set_up_apply_data(std::make_shared()); auto ell_mtx = gko::matrix::Ell<>::create(ref); auto dell_mtx = gko::matrix::Ell<>::create(cuda); @@ -379,7 +472,7 @@ TEST_F(Csr, ConvertToEllIsEquivalentToRef) TEST_F(Csr, MoveToEllIsEquivalentToRef) { - set_up_apply_data(std::make_shared()); + set_up_apply_data(std::make_shared()); auto ell_mtx = gko::matrix::Ell<>::create(ref); auto dell_mtx = gko::matrix::Ell<>::create(cuda); @@ -389,9 +482,10 @@ TEST_F(Csr, MoveToEllIsEquivalentToRef) GKO_ASSERT_MTX_NEAR(ell_mtx.get(), dell_mtx.get(), 1e-14); } + TEST_F(Csr, ConvertToSparsityCsrIsEquivalentToRef) { - set_up_apply_data(std::make_shared()); + set_up_apply_data(std::make_shared()); auto sparsity_mtx = gko::matrix::SparsityCsr<>::create(ref); auto d_sparsity_mtx = gko::matrix::SparsityCsr<>::create(cuda); @@ -404,7 +498,7 @@ TEST_F(Csr, ConvertToSparsityCsrIsEquivalentToRef) TEST_F(Csr, MoveToSparsityCsrIsEquivalentToRef) { - set_up_apply_data(std::make_shared()); + set_up_apply_data(std::make_shared()); auto sparsity_mtx = gko::matrix::SparsityCsr<>::create(ref); auto d_sparsity_mtx = gko::matrix::SparsityCsr<>::create(cuda); @@ -417,7 +511,7 @@ TEST_F(Csr, MoveToSparsityCsrIsEquivalentToRef) TEST_F(Csr, CalculateMaxNnzPerRowIsEquivalentToRef) { - 
set_up_apply_data(std::make_shared()); + set_up_apply_data(std::make_shared()); gko::size_type max_nnz_per_row; gko::size_type dmax_nnz_per_row; @@ -432,7 +526,7 @@ TEST_F(Csr, CalculateMaxNnzPerRowIsEquivalentToRef) TEST_F(Csr, ConvertToCooIsEquivalentToRef) { - set_up_apply_data(std::make_shared()); + set_up_apply_data(std::make_shared()); auto coo_mtx = gko::matrix::Coo<>::create(ref); auto dcoo_mtx = gko::matrix::Coo<>::create(cuda); @@ -445,7 +539,7 @@ TEST_F(Csr, ConvertToCooIsEquivalentToRef) TEST_F(Csr, MoveToCooIsEquivalentToRef) { - set_up_apply_data(std::make_shared()); + set_up_apply_data(std::make_shared()); auto coo_mtx = gko::matrix::Coo<>::create(ref); auto dcoo_mtx = gko::matrix::Coo<>::create(cuda); @@ -458,7 +552,7 @@ TEST_F(Csr, MoveToCooIsEquivalentToRef) TEST_F(Csr, ConvertToSellpIsEquivalentToRef) { - set_up_apply_data(std::make_shared()); + set_up_apply_data(std::make_shared()); auto sellp_mtx = gko::matrix::Sellp<>::create(ref); auto dsellp_mtx = gko::matrix::Sellp<>::create(cuda); @@ -471,7 +565,7 @@ TEST_F(Csr, ConvertToSellpIsEquivalentToRef) TEST_F(Csr, MoveToSellpIsEquivalentToRef) { - set_up_apply_data(std::make_shared()); + set_up_apply_data(std::make_shared()); auto sellp_mtx = gko::matrix::Sellp<>::create(ref); auto dsellp_mtx = gko::matrix::Sellp<>::create(cuda); @@ -482,9 +576,21 @@ TEST_F(Csr, MoveToSellpIsEquivalentToRef) } +TEST_F(Csr, ConvertsEmptyToSellp) +{ + auto dempty_mtx = Mtx::create(cuda); + auto dsellp_mtx = gko::matrix::Sellp<>::create(cuda); + + dempty_mtx->convert_to(dsellp_mtx.get()); + + ASSERT_EQ(cuda->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), 0); + ASSERT_FALSE(dsellp_mtx->get_size()); +} + + TEST_F(Csr, CalculateTotalColsIsEquivalentToRef) { - set_up_apply_data(std::make_shared()); + set_up_apply_data(std::make_shared()); gko::size_type total_cols; gko::size_type dtotal_cols; @@ -499,7 +605,7 @@ TEST_F(Csr, CalculateTotalColsIsEquivalentToRef) TEST_F(Csr, CalculatesNonzerosPerRow) { - 
set_up_apply_data(std::make_shared()); + set_up_apply_data(std::make_shared()); gko::Array row_nnz(ref, mtx->get_size()[0]); gko::Array drow_nnz(cuda, dmtx->get_size()[0]); @@ -508,14 +614,14 @@ TEST_F(Csr, CalculatesNonzerosPerRow) gko::kernels::cuda::csr::calculate_nonzeros_per_row(cuda, dmtx.get(), &drow_nnz); - GKO_ASSERT_ARRAY_EQ(&row_nnz, &drow_nnz); + GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz); } TEST_F(Csr, ConvertToHybridIsEquivalentToRef) { using Hybrid_type = gko::matrix::Hybrid<>; - set_up_apply_data(std::make_shared()); + set_up_apply_data(std::make_shared()); auto hybrid_mtx = Hybrid_type::create( ref, std::make_shared(2)); auto dhybrid_mtx = Hybrid_type::create( @@ -531,7 +637,7 @@ TEST_F(Csr, ConvertToHybridIsEquivalentToRef) TEST_F(Csr, MoveToHybridIsEquivalentToRef) { using Hybrid_type = gko::matrix::Hybrid<>; - set_up_apply_data(std::make_shared()); + set_up_apply_data(std::make_shared()); auto hybrid_mtx = Hybrid_type::create( ref, std::make_shared(2)); auto dhybrid_mtx = Hybrid_type::create( @@ -544,4 +650,79 @@ TEST_F(Csr, MoveToHybridIsEquivalentToRef) } +TEST_F(Csr, RecognizeSortedMatrixIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared()); + bool is_sorted_cuda{}; + bool is_sorted_ref{}; + + is_sorted_ref = mtx->is_sorted_by_column_index(); + is_sorted_cuda = dmtx->is_sorted_by_column_index(); + + ASSERT_EQ(is_sorted_ref, is_sorted_cuda); +} + + +TEST_F(Csr, RecognizeUnsortedMatrixIsEquivalentToRef) +{ + auto uns_mtx = gen_unsorted_mtx(); + bool is_sorted_cuda{}; + bool is_sorted_ref{}; + + is_sorted_ref = uns_mtx.ref->is_sorted_by_column_index(); + is_sorted_cuda = uns_mtx.cuda->is_sorted_by_column_index(); + + ASSERT_EQ(is_sorted_ref, is_sorted_cuda); +} + + +TEST_F(Csr, SortSortedMatrixIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared()); + + mtx->sort_by_column_index(); + dmtx->sort_by_column_index(); + + // Values must be unchanged, therefore, tolerance is `0` + GKO_ASSERT_MTX_NEAR(mtx, dmtx, 0); +} + + +TEST_F(Csr, 
SortUnsortedMatrixIsEquivalentToRef) +{ + auto uns_mtx = gen_unsorted_mtx(); + + uns_mtx.ref->sort_by_column_index(); + uns_mtx.cuda->sort_by_column_index(); + + // Values must be unchanged, therefore, tolerance is `0` + GKO_ASSERT_MTX_NEAR(uns_mtx.ref, uns_mtx.cuda, 0); +} + + +TEST_F(Csr, OneAutomaticalWorksWithDifferentMatrices) +{ + auto automatical = std::make_shared(); + auto row_len_limit = std::max(automatical->nvidia_row_len_limit, + automatical->amd_row_len_limit); + auto load_balance_mtx = Mtx::create(ref); + auto classical_mtx = Mtx::create(ref); + load_balance_mtx->copy_from( + gen_mtx(1, row_len_limit + 1000, row_len_limit + 1)); + classical_mtx->copy_from(gen_mtx(50, 50, 1)); + auto load_balance_mtx_d = Mtx::create(cuda); + auto classical_mtx_d = Mtx::create(cuda); + load_balance_mtx_d->copy_from(load_balance_mtx.get()); + classical_mtx_d->copy_from(classical_mtx.get()); + + load_balance_mtx_d->set_strategy(automatical); + classical_mtx_d->set_strategy(automatical); + + EXPECT_EQ("load_balance", load_balance_mtx_d->get_strategy()->get_name()); + EXPECT_EQ("classical", classical_mtx_d->get_strategy()->get_name()); + ASSERT_NE(load_balance_mtx_d->get_strategy().get(), + classical_mtx_d->get_strategy().get()); +} + + } // namespace diff --git a/cuda/test/matrix/dense_kernels.cpp b/cuda/test/matrix/dense_kernels.cpp index c153e1a5c74..34d2897641a 100644 --- a/cuda/test/matrix/dense_kernels.cpp +++ b/cuda/test/matrix/dense_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ -#include "core/matrix/dense_kernels.hpp" +#include #include @@ -40,14 +40,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include -#include #include #include -#include "core/test/utils.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "cuda/test/utils.hpp" namespace { @@ -55,8 +56,12 @@ namespace { class Dense : public ::testing::Test { protected: - using Mtx = gko::matrix::Dense<>; - using ComplexMtx = gko::matrix::Dense>; + using itype = int; + using vtype = double; + using Mtx = gko::matrix::Dense; + using NormVector = gko::matrix::Dense>; + using Arr = gko::Array; + using ComplexMtx = gko::matrix::Dense>; Dense() : rand_engine(15) {} @@ -123,6 +128,22 @@ class Dense : public ::testing::Test { dalpha->copy_from(alpha.get()); dbeta = Mtx::create(cuda); dbeta->copy_from(beta.get()); + + std::vector tmp(x->get_size()[0], 0); + auto rng = std::default_random_engine{}; + std::iota(tmp.begin(), tmp.end(), 0); + std::shuffle(tmp.begin(), tmp.end(), rng); + std::vector tmp2(x->get_size()[1], 0); + std::iota(tmp2.begin(), tmp2.end(), 0); + std::shuffle(tmp2.begin(), tmp2.end(), rng); + rpermute_idxs = + std::unique_ptr(new Arr{ref, tmp.begin(), tmp.end()}); + drpermute_idxs = + std::unique_ptr(new Arr{cuda, tmp.begin(), tmp.end()}); + cpermute_idxs = + std::unique_ptr(new Arr{ref, tmp2.begin(), tmp2.end()}); + dcpermute_idxs = + std::unique_ptr(new Arr{cuda, tmp2.begin(), tmp2.end()}); } std::shared_ptr ref; @@ -142,6 +163,10 @@ class Dense : public ::testing::Test { std::unique_ptr dy; std::unique_ptr dalpha; std::unique_ptr dbeta; + std::unique_ptr rpermute_idxs; + std::unique_ptr drpermute_idxs; + std::unique_ptr cpermute_idxs; + std::unique_ptr dcpermute_idxs; }; @@ -238,11 +263,14 @@ TEST_F(Dense, MultipleVectorCudaComputeDotIsEquivalentToRef) TEST_F(Dense, CudaComputeNorm2IsEquivalentToRef) { set_up_vector_data(20); + auto norm_size = 
gko::dim<2>{1, x->get_size()[1]}; + auto norm_expected = NormVector::create(this->ref, norm_size); + auto dnorm = NormVector::create(this->cuda, norm_size); - x->compute_norm2(expected.get()); - dx->compute_norm2(dresult.get()); + x->compute_norm2(norm_expected.get()); + dx->compute_norm2(dnorm.get()); - GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); + GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, 1e-14); } @@ -400,6 +428,18 @@ TEST_F(Dense, MoveToSellpIsEquivalentToRef) } +TEST_F(Dense, ConvertsEmptyToSellp) +{ + auto dempty_mtx = Mtx::create(cuda); + auto dsellp_mtx = gko::matrix::Sellp<>::create(cuda); + + dempty_mtx->convert_to(dsellp_mtx.get()); + + ASSERT_EQ(cuda->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), 0); + ASSERT_FALSE(dsellp_mtx->get_size()); +} + + TEST_F(Dense, CountNNZIsEquivalentToRef) { set_up_apply_data(); @@ -463,4 +503,52 @@ TEST_F(Dense, CalculateTotalColsIsEquivalentToRef) } +TEST_F(Dense, IsRowPermutable) +{ + set_up_apply_data(); + + auto r_permute = x->row_permute(rpermute_idxs.get()); + auto dr_permute = dx->row_permute(drpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(r_permute.get()), + static_cast(dr_permute.get()), 0); +} + + +TEST_F(Dense, IsColPermutable) +{ + set_up_apply_data(); + + auto c_permute = x->column_permute(cpermute_idxs.get()); + auto dc_permute = dx->column_permute(dcpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(c_permute.get()), + static_cast(dc_permute.get()), 0); +} + + +TEST_F(Dense, IsInverseRowPermutable) +{ + set_up_apply_data(); + + auto inverse_r_permute = x->inverse_row_permute(rpermute_idxs.get()); + auto d_inverse_r_permute = dx->inverse_row_permute(drpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(inverse_r_permute.get()), + static_cast(d_inverse_r_permute.get()), 0); +} + + +TEST_F(Dense, IsInverseColPermutable) +{ + set_up_apply_data(); + + auto inverse_c_permute = x->inverse_column_permute(cpermute_idxs.get()); + auto d_inverse_c_permute = 
dx->inverse_column_permute(dcpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(inverse_c_permute.get()), + static_cast(d_inverse_c_permute.get()), 0); +} + + } // namespace diff --git a/cuda/test/matrix/ell_kernels.cpp b/cuda/test/matrix/ell_kernels.cpp index ff4ae0b8b88..d913d80e722 100644 --- a/cuda/test/matrix/ell_kernels.cpp +++ b/cuda/test/matrix/ell_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -39,7 +39,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include #include #include @@ -48,6 +47,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/ell_kernels.hpp" +#include "core/test/utils.hpp" +#include "cuda/test/utils.hpp" namespace { diff --git a/cuda/test/matrix/hybrid_kernels.cpp b/cuda/test/matrix/hybrid_kernels.cpp index cb7ab693899..f3225882021 100644 --- a/cuda/test/matrix/hybrid_kernels.cpp +++ b/cuda/test/matrix/hybrid_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/hybrid_kernels.hpp" +#include #include @@ -39,12 +39,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include #include #include #include #include -#include + + +#include "core/matrix/hybrid_kernels.hpp" +#include "cuda/test/utils.hpp" namespace { diff --git a/cuda/test/matrix/sellp_kernels.cpp b/cuda/test/matrix/sellp_kernels.cpp index b213c1655db..08b276374a7 100644 --- a/cuda/test/matrix/sellp_kernels.cpp +++ b/cuda/test/matrix/sellp_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -39,7 +39,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include #include #include @@ -48,6 +47,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/sellp_kernels.hpp" +#include "core/test/utils.hpp" +#include "cuda/test/utils.hpp" namespace { diff --git a/cuda/test/preconditioner/CMakeLists.txt b/cuda/test/preconditioner/CMakeLists.txt index a0ca5a2e38a..a4473684560 100644 --- a/cuda/test/preconditioner/CMakeLists.txt +++ b/cuda/test/preconditioner/CMakeLists.txt @@ -1 +1,2 @@ ginkgo_create_test(jacobi_kernels) +ginkgo_create_test_cpp_cuda_header(isai_kernels) diff --git a/cuda/test/preconditioner/isai_kernels.cpp b/cuda/test/preconditioner/isai_kernels.cpp new file mode 100644 index 00000000000..fb8947e9ae6 --- /dev/null +++ b/cuda/test/preconditioner/isai_kernels.cpp @@ -0,0 +1,326 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include + + +#include "core/preconditioner/isai_kernels.hpp" +#include "cuda/base/config.hpp" +#include "cuda/test/utils.hpp" + + +namespace { + + +enum struct matrix_type { lower, upper }; +class Isai : public ::testing::Test { +protected: + using value_type = double; + using index_type = gko::int32; + using Csr = gko::matrix::Csr; + using Dense = gko::matrix::Dense; + Isai() : rand_engine(42) {} + + void SetUp() + { + ASSERT_GT(gko::CudaExecutor::get_num_devices(), 0); + ref = gko::ReferenceExecutor::create(); + cuda = gko::CudaExecutor::create(0, ref); + } + + std::unique_ptr clone_allocations(const Csr *csr_mtx) + { + if (csr_mtx->get_executor() != ref) { + return {nullptr}; + } + const auto num_elems = csr_mtx->get_num_stored_elements(); + auto sparsity = csr_mtx->clone(); + + // values are now filled with invalid data to catch potential errors + auto begin_values = sparsity->get_values(); + auto end_values = begin_values + num_elems; + std::fill(begin_values, end_values, -gko::one()); + return sparsity; + } + + void initialize_data(matrix_type type, gko::size_type n, + gko::size_type row_limit) + { + const bool for_lower_tm = type == matrix_type::lower; + auto nz_dist = std::uniform_int_distribution(1, row_limit); + auto val_dist = std::uniform_real_distribution(-1., 1.); + mtx = Csr::create(ref); + mtx = gko::test::generate_random_triangular_matrix( + n, n, true, for_lower_tm, nz_dist, val_dist, rand_engine, ref, + gko::dim<2>{n, n}); + inverse = clone_allocations(mtx.get()); + + d_mtx = Csr::create(cuda); + d_mtx->copy_from(mtx.get()); + d_inverse = Csr::create(cuda); + d_inverse->copy_from(inverse.get()); + } + + + std::shared_ptr ref; + std::shared_ptr cuda; + + std::default_random_engine rand_engine; + + std::unique_ptr mtx; + std::unique_ptr inverse; + + std::unique_ptr d_mtx; + std::unique_ptr d_inverse; +}; + + 
+TEST_F(Isai, CudaIsaiGenerateLinverseShortIsEquivalentToRef) +{ + initialize_data(matrix_type::lower, 536, 31); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(cuda, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::kernels::cuda::isai::generate_tri_inverse( + cuda, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + true); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_EQ(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, CudaIsaiGenerateUinverseShortIsEquivalentToRef) +{ + initialize_data(matrix_type::upper, 615, 31); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(cuda, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::kernels::cuda::isai::generate_tri_inverse( + cuda, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + false); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_EQ(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, CudaIsaiGenerateLinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::lower, 554, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(cuda, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::kernels::cuda::isai::generate_tri_inverse( + cuda, d_mtx.get(), d_inverse.get(), da1.get_data(), 
da2.get_data(), + true); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_GT(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, CudaIsaiGenerateUinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::upper, 695, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(cuda, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::kernels::cuda::isai::generate_tri_inverse( + cuda, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + false); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_GT(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, CudaIsaiGenerateExcessLinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::lower, 518, 40); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::Array da1(cuda, a1); + gko::Array da2(cuda, a2); + auto e_dim = a1.get_data()[num_rows]; + auto e_nnz = a2.get_data()[num_rows]; + auto excess = Csr::create(ref, gko::dim<2>(e_dim, e_dim), e_nnz); + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + auto dexcess = Csr::create(cuda, gko::dim<2>(e_dim, e_dim), e_nnz); + auto de_rhs = Dense::create(cuda, gko::dim<2>(e_dim, 1)); + + gko::kernels::reference::isai::generate_excess_system( + ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), + excess.get(), e_rhs.get()); + gko::kernels::cuda::isai::generate_excess_system( + cuda, d_mtx.get(), d_inverse.get(), da1.get_const_data(), 
+ da2.get_const_data(), dexcess.get(), de_rhs.get()); + + GKO_ASSERT_MTX_EQ_SPARSITY(excess, dexcess); + GKO_ASSERT_MTX_NEAR(excess, dexcess, 0); + GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, CudaIsaiGenerateExcessUinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::upper, 673, 51); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::Array da1(cuda, a1); + gko::Array da2(cuda, a2); + auto e_dim = a1.get_data()[num_rows]; + auto e_nnz = a2.get_data()[num_rows]; + auto excess = Csr::create(ref, gko::dim<2>(e_dim, e_dim), e_nnz); + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + auto dexcess = Csr::create(cuda, gko::dim<2>(e_dim, e_dim), e_nnz); + auto de_rhs = Dense::create(cuda, gko::dim<2>(e_dim, 1)); + + gko::kernels::reference::isai::generate_excess_system( + ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), + excess.get(), e_rhs.get()); + gko::kernels::cuda::isai::generate_excess_system( + cuda, d_mtx.get(), d_inverse.get(), da1.get_const_data(), + da2.get_const_data(), dexcess.get(), de_rhs.get()); + + GKO_ASSERT_MTX_EQ_SPARSITY(excess, dexcess); + GKO_ASSERT_MTX_NEAR(excess, dexcess, 0); + GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, CudaIsaiScatterExcessSolutionLIsEquivalentToRef) +{ + initialize_data(matrix_type::lower, 572, 52); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::Array da1(cuda, a1); + auto e_dim = a1.get_data()[num_rows]; + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + std::fill_n(e_rhs->get_values(), e_dim, 123456); + auto de_rhs = Dense::create(cuda); + 
de_rhs->copy_from(lend(e_rhs)); + d_inverse->copy_from(lend(inverse)); + + gko::kernels::reference::isai::scatter_excess_solution( + ref, a1.get_const_data(), e_rhs.get(), inverse.get()); + gko::kernels::cuda::isai::scatter_excess_solution( + cuda, da1.get_const_data(), de_rhs.get(), d_inverse.get()); + + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, CudaIsaiScatterExcessSolutionUIsEquivalentToRef) +{ + initialize_data(matrix_type::upper, 702, 45); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::Array da1(cuda, a1); + auto e_dim = a1.get_data()[num_rows]; + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + std::fill_n(e_rhs->get_values(), e_dim, 123456); + auto de_rhs = Dense::create(cuda); + de_rhs->copy_from(lend(e_rhs)); + // overwrite -1 values with inverse + d_inverse->copy_from(lend(inverse)); + + gko::kernels::reference::isai::scatter_excess_solution( + ref, a1.get_const_data(), e_rhs.get(), inverse.get()); + gko::kernels::cuda::isai::scatter_excess_solution( + cuda, da1.get_const_data(), de_rhs.get(), d_inverse.get()); + + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); + ASSERT_GT(e_dim, 0); +} + + +} // namespace diff --git a/cuda/test/preconditioner/jacobi_kernels.cpp b/cuda/test/preconditioner/jacobi_kernels.cpp index b98e61fc41c..05ea7d766e8 100644 --- a/cuda/test/preconditioner/jacobi_kernels.cpp +++ b/cuda/test/preconditioner/jacobi_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,17 +33,19 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include +#include -#include +#include -#include #include #include +#include "core/test/utils.hpp" + + namespace { @@ -323,6 +325,34 @@ TEST_F(Jacobi, CudaPreconditionerEquivalentToRefWithMPW) } +TEST_F(Jacobi, CudaTransposedPreconditionerEquivalentToRefWithMPW) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + d_bj->copy_from(bj.get()); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj->transpose()), + gko::as(bj->transpose()), 1e-14); +} + + +TEST_F(Jacobi, CudaConjTransposedPreconditionerEquivalentToRefWithMPW) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + d_bj->copy_from(bj.get()); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj->conj_transpose()), + gko::as(bj->conj_transpose()), 1e-14); +} + + TEST_F(Jacobi, CudaApplyEquivalentToRefWithBlockSize32) { initialize_data({0, 32, 64, 96, 128}, {}, {}, 32, 100, 111); @@ -561,6 +591,37 @@ TEST_F(Jacobi, CudaPreconditionerEquivalentToRefWithAdaptivePrecision) } +TEST_F(Jacobi, CudaTransposedPreconditionerEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + d_bj->copy_from(bj.get()); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj->transpose()), + gko::as(bj->transpose()), 1e-14); +} + + +TEST_F(Jacobi, + CudaConjTransposedPreconditionerEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + d_bj->copy_from(bj.get()); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj->conj_transpose()), + 
gko::as(bj->conj_transpose()), 1e-14); +} + + TEST_F(Jacobi, CudaApplyEquivalentToRefWithFullPrecision) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, diff --git a/cuda/test/solver/CMakeLists.txt b/cuda/test/solver/CMakeLists.txt index e2a017962a5..32dbb96fe61 100644 --- a/cuda/test/solver/CMakeLists.txt +++ b/cuda/test/solver/CMakeLists.txt @@ -1,8 +1,9 @@ +ginkgo_create_test(bicg_kernels) ginkgo_create_test(bicgstab_kernels) ginkgo_create_test(cg_kernels) ginkgo_create_test(cgs_kernels) ginkgo_create_test(fcg_kernels) ginkgo_create_test(gmres_kernels) ginkgo_create_test(ir_kernels) -ginkgo_create_test(lower_trs_kernels) -ginkgo_create_test(upper_trs_kernels) +ginkgo_create_test_cpp_cuda_header(lower_trs_kernels) +ginkgo_create_test_cpp_cuda_header(upper_trs_kernels) diff --git a/cuda/test/solver/bicg_kernels.cpp b/cuda/test/solver/bicg_kernels.cpp new file mode 100644 index 00000000000..e58eef7e68f --- /dev/null +++ b/cuda/test/solver/bicg_kernels.cpp @@ -0,0 +1,357 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include +#include +#include + + +#include + + +#include +#include +#include +#include +#include +#include + + +#include "core/solver/bicg_kernels.hpp" +#include "cuda/test/utils.hpp" +#include "matrices/config.hpp" + + +namespace { + + +class Bicg : public ::testing::Test { +protected: + using value_type = gko::default_precision; + using index_type = gko::int32; + using Mtx = gko::matrix::Dense<>; + using Csr = gko::matrix::Csr; + Bicg() : rand_engine(30) {} + + void SetUp() + { + ASSERT_GT(gko::CudaExecutor::get_num_devices(), 0); + ref = gko::ReferenceExecutor::create(); + cuda = gko::CudaExecutor::create(0, ref); + + std::string file_name(gko::matrices::location_ani1_mtx); + auto input_file = std::ifstream(file_name, std::ios::in); + if (!input_file) { + FAIL() << "Could not find the file \"" << file_name + << "\", which is required for this test.\n"; + } + csr_ref = gko::read(input_file, ref); + auto csr_cuda_temp = Csr::create(cuda); + csr_cuda_temp->copy_from(gko::lend(csr_ref)); + csr_cuda = gko::give(csr_cuda_temp); + } + + void TearDown() + { + if (cuda != nullptr) { + ASSERT_NO_THROW(cuda->synchronize()); + } + } + + 
std::unique_ptr gen_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + } + + void initialize_data() + { + int m = 597; + int n = 43; + b = gen_mtx(m, n); + r = gen_mtx(m, n); + z = gen_mtx(m, n); + p = gen_mtx(m, n); + q = gen_mtx(m, n); + r2 = gen_mtx(m, n); + z2 = gen_mtx(m, n); + p2 = gen_mtx(m, n); + q2 = gen_mtx(m, n); + x = gen_mtx(m, n); + beta = gen_mtx(1, n); + prev_rho = gen_mtx(1, n); + rho = gen_mtx(1, n); + stop_status = std::unique_ptr>( + new gko::Array(ref, n)); + for (size_t i = 0; i < stop_status->get_num_elems(); ++i) { + stop_status->get_data()[i].reset(); + } + + d_b = Mtx::create(cuda); + d_b->copy_from(b.get()); + d_r = Mtx::create(cuda); + d_r->copy_from(r.get()); + d_z = Mtx::create(cuda); + d_z->copy_from(z.get()); + d_p = Mtx::create(cuda); + d_p->copy_from(p.get()); + d_q = Mtx::create(cuda); + d_q->copy_from(q.get()); + d_r2 = Mtx::create(cuda); + d_r2->copy_from(r2.get()); + d_z2 = Mtx::create(cuda); + d_z2->copy_from(z2.get()); + d_p2 = Mtx::create(cuda); + d_p2->copy_from(p2.get()); + d_q2 = Mtx::create(cuda); + d_q2->copy_from(q2.get()); + d_x = Mtx::create(cuda); + d_x->copy_from(x.get()); + d_beta = Mtx::create(cuda); + d_beta->copy_from(beta.get()); + d_prev_rho = Mtx::create(cuda); + d_prev_rho->copy_from(prev_rho.get()); + d_rho = Mtx::create(cuda); + d_rho->copy_from(rho.get()); + d_stop_status = std::unique_ptr>( + new gko::Array(cuda, n)); + *d_stop_status = *stop_status; + } + + void make_symetric(Mtx *mtx) + { + for (int i = 0; i < mtx->get_size()[0]; ++i) { + for (int j = i + 1; j < mtx->get_size()[1]; ++j) { + mtx->at(i, j) = mtx->at(j, i); + } + } + } + + void make_diag_dominant(Mtx *mtx) + { + using std::abs; + for (int i = 0; i < mtx->get_size()[0]; ++i) { + auto sum = gko::zero(); + for (int j = 0; j < mtx->get_size()[1]; ++j) { + sum += abs(mtx->at(i, 
j)); + } + mtx->at(i, i) = sum; + } + } + + void make_spd(Mtx *mtx) + { + make_symetric(mtx); + make_diag_dominant(mtx); + } + + std::shared_ptr ref; + std::shared_ptr cuda; + + std::ranlux48 rand_engine; + + std::unique_ptr b; + std::unique_ptr r; + std::unique_ptr z; + std::unique_ptr p; + std::unique_ptr q; + std::unique_ptr r2; + std::unique_ptr z2; + std::unique_ptr p2; + std::unique_ptr q2; + std::unique_ptr x; + std::unique_ptr beta; + std::unique_ptr prev_rho; + std::unique_ptr rho; + std::unique_ptr> stop_status; + + std::unique_ptr d_b; + std::unique_ptr d_r; + std::unique_ptr d_z; + std::unique_ptr d_p; + std::unique_ptr d_q; + std::unique_ptr d_r2; + std::unique_ptr d_z2; + std::unique_ptr d_p2; + std::unique_ptr d_q2; + std::unique_ptr d_x; + std::unique_ptr d_beta; + std::unique_ptr d_prev_rho; + std::unique_ptr d_rho; + std::unique_ptr> d_stop_status; + std::shared_ptr csr_ref; + std::shared_ptr csr_cuda; +}; + + +TEST_F(Bicg, CudaBicgInitializeIsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::bicg::initialize( + ref, b.get(), r.get(), z.get(), p.get(), q.get(), prev_rho.get(), + rho.get(), r2.get(), z2.get(), p2.get(), q2.get(), stop_status.get()); + gko::kernels::cuda::bicg::initialize( + cuda, d_b.get(), d_r.get(), d_z.get(), d_p.get(), d_q.get(), + d_prev_rho.get(), d_rho.get(), d_r2.get(), d_z2.get(), d_p2.get(), + d_q2.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14); + GKO_ASSERT_MTX_NEAR(d_z, z, 1e-14); + GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14); + GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14); + GKO_ASSERT_MTX_NEAR(d_r2, r2, 1e-14); + GKO_ASSERT_MTX_NEAR(d_z2, z2, 1e-14); + GKO_ASSERT_MTX_NEAR(d_p2, p2, 1e-14); + GKO_ASSERT_MTX_NEAR(d_q2, q2, 1e-14); + GKO_ASSERT_MTX_NEAR(d_prev_rho, prev_rho, 1e-14); + GKO_ASSERT_MTX_NEAR(d_rho, rho, 1e-14); + GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status); +} + + +TEST_F(Bicg, CudaBicgStep1IsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::bicg::step_1(ref, 
p.get(), z.get(), p2.get(), + z2.get(), rho.get(), prev_rho.get(), + stop_status.get()); + gko::kernels::cuda::bicg::step_1(cuda, d_p.get(), d_z.get(), d_p2.get(), + d_z2.get(), d_rho.get(), d_prev_rho.get(), + d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14); + GKO_ASSERT_MTX_NEAR(d_z, z, 1e-14); + GKO_ASSERT_MTX_NEAR(d_p2, p2, 1e-14); + GKO_ASSERT_MTX_NEAR(d_z2, z2, 1e-14); +} + + +TEST_F(Bicg, CudaBicgStep2IsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::bicg::step_2( + ref, x.get(), r.get(), r2.get(), p.get(), q.get(), q2.get(), beta.get(), + rho.get(), stop_status.get()); + gko::kernels::cuda::bicg::step_2( + cuda, d_x.get(), d_r.get(), d_r2.get(), d_p.get(), d_q.get(), + d_q2.get(), d_beta.get(), d_rho.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); + GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14); + GKO_ASSERT_MTX_NEAR(d_r2, r2, 1e-14); + GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14); + GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14); + GKO_ASSERT_MTX_NEAR(d_q2, q2, 1e-14); +} + + +TEST_F(Bicg, ApplyWithSpdMatrixIsEquivalentToRef) +{ + auto mtx = gen_mtx(50, 50); + make_spd(mtx.get()); + auto x = gen_mtx(50, 3); + auto b = gen_mtx(50, 3); + auto d_mtx = Mtx::create(cuda); + d_mtx->copy_from(mtx.get()); + auto d_x = Mtx::create(cuda); + d_x->copy_from(x.get()); + auto d_b = Mtx::create(cuda); + d_b->copy_from(b.get()); + auto bicg_factory = + gko::solver::Bicg<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(50u).on(ref), + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(1e-14) + .on(ref)) + .on(ref); + auto d_bicg_factory = + gko::solver::Bicg<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(50u).on(cuda), + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(1e-14) + .on(cuda)) + .on(cuda); + auto solver = bicg_factory->generate(std::move(mtx)); + auto d_solver = d_bicg_factory->generate(std::move(d_mtx)); + + solver->apply(b.get(), x.get()); + 
d_solver->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); +} + + +TEST_F(Bicg, ApplyWithSuiteSparseMatrixIsEquivalentToRef) +{ + auto x = gen_mtx(36, 1); + auto b = gen_mtx(36, 1); + auto d_x = Mtx::create(cuda); + d_x->copy_from(x.get()); + auto d_b = Mtx::create(cuda); + d_b->copy_from(b.get()); + auto bicg_factory = + gko::solver::Bicg<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(50u).on(ref), + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(1e-14) + .on(ref)) + .on(ref); + auto d_bicg_factory = + gko::solver::Bicg<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(50u).on(cuda), + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(1e-14) + .on(cuda)) + .on(cuda); + auto solver = bicg_factory->generate(std::move(csr_ref)); + auto d_solver = d_bicg_factory->generate(std::move(csr_cuda)); + + solver->apply(b.get(), x.get()); + d_solver->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); +} + + +} // namespace diff --git a/cuda/test/solver/bicgstab_kernels.cpp b/cuda/test/solver/bicgstab_kernels.cpp index 0f1a0e190e5..c809ad6a17b 100644 --- a/cuda/test/solver/bicgstab_kernels.cpp +++ b/cuda/test/solver/bicgstab_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,21 +33,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include +#include -#include +#include -#include -#include #include #include #include #include #include #include -#include +#include + + +#include "core/solver/bicgstab_kernels.hpp" +#include "cuda/test/utils.hpp" namespace { @@ -148,8 +150,6 @@ class Bicgstab : public ::testing::Test { d_omega = Mtx::create(cuda); d_stop_status = std::unique_ptr>( new gko::Array(cuda)); - d_stop_status = std::unique_ptr>( - new gko::Array(cuda)); d_x->copy_from(x.get()); d_b->copy_from(b.get()); @@ -259,7 +259,7 @@ TEST_F(Bicgstab, CudaBicgstabInitializeIsEquivalentToRef) GKO_EXPECT_MTX_NEAR(d_beta, beta, 1e-14); GKO_EXPECT_MTX_NEAR(d_gamma, gamma, 1e-14); GKO_EXPECT_MTX_NEAR(d_omega, omega, 1e-14); - GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status); + GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status); } diff --git a/cuda/test/solver/cg_kernels.cpp b/cuda/test/solver/cg_kernels.cpp index 3c40c3f59c8..65f8d78781f 100644 --- a/cuda/test/solver/cg_kernels.cpp +++ b/cuda/test/solver/cg_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,20 +33,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include +#include -#include +#include -#include -#include #include #include #include #include #include -#include +#include + + +#include "core/solver/cg_kernels.hpp" +#include "cuda/test/utils.hpp" + namespace { @@ -193,7 +196,7 @@ TEST_F(Cg, CudaCgInitializeIsEquivalentToRef) GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14); GKO_ASSERT_MTX_NEAR(d_prev_rho, prev_rho, 1e-14); GKO_ASSERT_MTX_NEAR(d_rho, rho, 1e-14); - GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status); + GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status); } diff --git a/cuda/test/solver/cgs_kernels.cpp b/cuda/test/solver/cgs_kernels.cpp index d5a73474147..3e49804ddab 100644 --- a/cuda/test/solver/cgs_kernels.cpp +++ b/cuda/test/solver/cgs_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,20 +33,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include +#include -#include +#include -#include -#include #include #include #include #include #include -#include +#include + + +#include "core/solver/cgs_kernels.hpp" +#include "cuda/test/utils.hpp" namespace { @@ -249,7 +251,7 @@ TEST_F(Cgs, CudaCgsInitializeIsEquivalentToRef) GKO_ASSERT_MTX_NEAR(d_alpha, alpha, 1e-14); GKO_ASSERT_MTX_NEAR(d_beta, beta, 1e-14); GKO_ASSERT_MTX_NEAR(d_gamma, gamma, 1e-14); - GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status); + GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status); } diff --git a/cuda/test/solver/fcg_kernels.cpp b/cuda/test/solver/fcg_kernels.cpp index 22d7e5702a4..2b5f3ac5441 100644 --- a/cuda/test/solver/fcg_kernels.cpp +++ b/cuda/test/solver/fcg_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -33,20 +33,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include +#include -#include +#include -#include -#include #include #include #include #include #include -#include +#include + + +#include "core/solver/fcg_kernels.hpp" +#include "cuda/test/utils.hpp" + namespace { @@ -207,7 +210,7 @@ TEST_F(Fcg, CudaFcgInitializeIsEquivalentToRef) GKO_ASSERT_MTX_NEAR(d_prev_rho, prev_rho, 1e-14); GKO_ASSERT_MTX_NEAR(d_rho, rho, 1e-14); GKO_ASSERT_MTX_NEAR(d_rho_t, rho_t, 1e-14); - GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status); + GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status); } diff --git a/cuda/test/solver/gmres_kernels.cpp b/cuda/test/solver/gmres_kernels.cpp index 9f731464dac..2dcd4d2653c 100644 --- a/cuda/test/solver/gmres_kernels.cpp +++ b/cuda/test/solver/gmres_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,20 +33,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include +#include -#include +#include -#include -#include #include #include +#include #include #include #include -#include +#include + + +#include "core/solver/gmres_kernels.hpp" +#include "cuda/test/utils.hpp" namespace { @@ -54,7 +57,14 @@ namespace { class Gmres : public ::testing::Test { protected: - using Mtx = gko::matrix::Dense<>; + using value_type = gko::default_precision; + using index_type = gko::int32; + using Mtx = gko::matrix::Dense; + using norm_type = gko::remove_complex; + using NormVector = gko::matrix::Dense; + template + using Dense = typename gko::matrix::Dense; + Gmres() : rand_engine(30) {} void SetUp() @@ -71,41 +81,39 @@ class Gmres : public ::testing::Test { } } - std::unique_ptr gen_mtx(int num_rows, int num_cols) + template + std::unique_ptr> gen_mtx(int num_rows, int num_cols) { - return gko::test::generate_random_matrix( + return gko::test::generate_random_matrix>( num_rows, num_cols, - std::uniform_int_distribution<>(num_cols, num_cols), - std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + std::uniform_int_distribution(num_cols, num_cols), + std::normal_distribution(-1.0, 1.0), rand_engine, ref); } - void initialize_data() + void initialize_data(int nrhs = 43) { int m = 597; - int n = 43; - x = gen_mtx(m, n); - y = gen_mtx(gko::solver::default_krylov_dim, n); + x = gen_mtx(m, nrhs); + y = gen_mtx(gko::solver::default_krylov_dim, nrhs); before_preconditioner = Mtx::create_with_config_of(x.get()); - b = gen_mtx(m, n); - b_norm = gen_mtx(1, n); - krylov_bases = gen_mtx(m, (gko::solver::default_krylov_dim + 1) * n); - next_krylov_basis = gen_mtx(m, n); + b = gen_mtx(m, nrhs); + krylov_bases = gen_mtx(m * (gko::solver::default_krylov_dim + 1), nrhs); hessenberg = gen_mtx(gko::solver::default_krylov_dim + 1, - gko::solver::default_krylov_dim * n); - hessenberg_iter = gen_mtx(gko::solver::default_krylov_dim + 1, n); - residual = gen_mtx(m, n); - residual_norm = gen_mtx(1, n); + gko::solver::default_krylov_dim * nrhs); + 
hessenberg_iter = gen_mtx(gko::solver::default_krylov_dim + 1, nrhs); + residual = gen_mtx(m, nrhs); + residual_norm = gen_mtx(1, nrhs); residual_norm_collection = - gen_mtx(gko::solver::default_krylov_dim + 1, n); - givens_sin = gen_mtx(gko::solver::default_krylov_dim, n); - givens_cos = gen_mtx(gko::solver::default_krylov_dim, n); + gen_mtx(gko::solver::default_krylov_dim + 1, nrhs); + givens_sin = gen_mtx(gko::solver::default_krylov_dim, nrhs); + givens_cos = gen_mtx(gko::solver::default_krylov_dim, nrhs); stop_status = std::unique_ptr>( - new gko::Array(ref, n)); + new gko::Array(ref, nrhs)); for (size_t i = 0; i < stop_status->get_num_elems(); ++i) { stop_status->get_data()[i].reset(); } final_iter_nums = std::unique_ptr>( - new gko::Array(ref, n)); + new gko::Array(ref, nrhs)); for (size_t i = 0; i < final_iter_nums->get_num_elems(); ++i) { final_iter_nums->get_data()[i] = 5; } @@ -117,19 +125,15 @@ class Gmres : public ::testing::Test { d_y->copy_from(y.get()); d_b = Mtx::create(cuda); d_b->copy_from(b.get()); - d_b_norm = Mtx::create(cuda); - d_b_norm->copy_from(b_norm.get()); d_krylov_bases = Mtx::create(cuda); d_krylov_bases->copy_from(krylov_bases.get()); - d_next_krylov_basis = Mtx::create(cuda); - d_next_krylov_basis->copy_from(next_krylov_basis.get()); d_hessenberg = Mtx::create(cuda); d_hessenberg->copy_from(hessenberg.get()); d_hessenberg_iter = Mtx::create(cuda); d_hessenberg_iter->copy_from(hessenberg_iter.get()); d_residual = Mtx::create(cuda); d_residual->copy_from(residual.get()); - d_residual_norm = Mtx::create(cuda); + d_residual_norm = NormVector::create(cuda); d_residual_norm->copy_from(residual_norm.get()); d_residual_norm_collection = Mtx::create(cuda); d_residual_norm_collection->copy_from(residual_norm_collection.get()); @@ -138,10 +142,10 @@ class Gmres : public ::testing::Test { d_givens_cos = Mtx::create(cuda); d_givens_cos->copy_from(givens_cos.get()); d_stop_status = std::unique_ptr>( - new gko::Array(cuda, n)); + new 
gko::Array(cuda, nrhs)); *d_stop_status = *stop_status; d_final_iter_nums = std::unique_ptr>( - new gko::Array(cuda, n)); + new gko::Array(cuda, nrhs)); *d_final_iter_nums = *final_iter_nums; } @@ -154,13 +158,11 @@ class Gmres : public ::testing::Test { std::unique_ptr x; std::unique_ptr y; std::unique_ptr b; - std::unique_ptr b_norm; std::unique_ptr krylov_bases; - std::unique_ptr next_krylov_basis; std::unique_ptr hessenberg; std::unique_ptr hessenberg_iter; std::unique_ptr residual; - std::unique_ptr residual_norm; + std::unique_ptr residual_norm; std::unique_ptr residual_norm_collection; std::unique_ptr givens_sin; std::unique_ptr givens_cos; @@ -171,13 +173,11 @@ class Gmres : public ::testing::Test { std::unique_ptr d_before_preconditioner; std::unique_ptr d_y; std::unique_ptr d_b; - std::unique_ptr d_b_norm; std::unique_ptr d_krylov_bases; - std::unique_ptr d_next_krylov_basis; std::unique_ptr d_hessenberg; std::unique_ptr d_hessenberg_iter; std::unique_ptr d_residual; - std::unique_ptr d_residual_norm; + std::unique_ptr d_residual_norm; std::unique_ptr d_residual_norm_collection; std::unique_ptr d_givens_sin; std::unique_ptr d_givens_cos; @@ -191,18 +191,17 @@ TEST_F(Gmres, CudaGmresInitialize1IsEquivalentToRef) initialize_data(); gko::kernels::reference::gmres::initialize_1( - ref, b.get(), b_norm.get(), residual.get(), givens_sin.get(), - givens_cos.get(), stop_status.get(), gko::solver::default_krylov_dim); + ref, b.get(), residual.get(), givens_sin.get(), givens_cos.get(), + stop_status.get(), gko::solver::default_krylov_dim); gko::kernels::cuda::gmres::initialize_1( - cuda, d_b.get(), d_b_norm.get(), d_residual.get(), d_givens_sin.get(), + cuda, d_b.get(), d_residual.get(), d_givens_sin.get(), d_givens_cos.get(), d_stop_status.get(), gko::solver::default_krylov_dim); - GKO_ASSERT_MTX_NEAR(d_b_norm, b_norm, 1e-14); GKO_ASSERT_MTX_NEAR(d_residual, residual, 1e-14); GKO_ASSERT_MTX_NEAR(d_givens_sin, givens_sin, 1e-14); GKO_ASSERT_MTX_NEAR(d_givens_cos, 
givens_cos, 1e-14); - GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status); + GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status); } @@ -223,7 +222,7 @@ TEST_F(Gmres, CudaGmresInitialize2IsEquivalentToRef) GKO_ASSERT_MTX_NEAR(d_residual_norm_collection, residual_norm_collection, 1e-14); GKO_ASSERT_MTX_NEAR(d_krylov_bases, krylov_bases, 1e-14); - GKO_ASSERT_ARRAY_EQ(d_final_iter_nums, final_iter_nums); + GKO_ASSERT_ARRAY_EQ(*d_final_iter_nums, *final_iter_nums); } @@ -233,17 +232,41 @@ TEST_F(Gmres, CudaGmresStep1IsEquivalentToRef) int iter = 5; gko::kernels::reference::gmres::step_1( - ref, next_krylov_basis.get(), givens_sin.get(), givens_cos.get(), + ref, x->get_size()[0], givens_sin.get(), givens_cos.get(), + residual_norm.get(), residual_norm_collection.get(), krylov_bases.get(), + hessenberg_iter.get(), iter, final_iter_nums.get(), stop_status.get()); + gko::kernels::cuda::gmres::step_1( + cuda, d_x->get_size()[0], d_givens_sin.get(), d_givens_cos.get(), + d_residual_norm.get(), d_residual_norm_collection.get(), + d_krylov_bases.get(), d_hessenberg_iter.get(), iter, + d_final_iter_nums.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_givens_sin, givens_sin, 1e-14); + GKO_ASSERT_MTX_NEAR(d_givens_cos, givens_cos, 1e-14); + GKO_ASSERT_MTX_NEAR(d_residual_norm, residual_norm, 1e-14); + GKO_ASSERT_MTX_NEAR(d_residual_norm_collection, residual_norm_collection, + 1e-14); + GKO_ASSERT_MTX_NEAR(d_hessenberg_iter, hessenberg_iter, 1e-14); + GKO_ASSERT_MTX_NEAR(d_krylov_bases, krylov_bases, 1e-14); + GKO_ASSERT_ARRAY_EQ(*d_final_iter_nums, *final_iter_nums); +} + + +TEST_F(Gmres, CudaGmresStep1OnSingleRHSIsEquivalentToRef) +{ + initialize_data(1); + int iter = 5; + + gko::kernels::reference::gmres::step_1( + ref, x->get_size()[0], givens_sin.get(), givens_cos.get(), residual_norm.get(), residual_norm_collection.get(), krylov_bases.get(), - hessenberg_iter.get(), b_norm.get(), iter, final_iter_nums.get(), - stop_status.get()); + hessenberg_iter.get(), iter, 
final_iter_nums.get(), stop_status.get()); gko::kernels::cuda::gmres::step_1( - cuda, d_next_krylov_basis.get(), d_givens_sin.get(), d_givens_cos.get(), + cuda, d_x->get_size()[0], d_givens_sin.get(), d_givens_cos.get(), d_residual_norm.get(), d_residual_norm_collection.get(), - d_krylov_bases.get(), d_hessenberg_iter.get(), d_b_norm.get(), iter, + d_krylov_bases.get(), d_hessenberg_iter.get(), iter, d_final_iter_nums.get(), d_stop_status.get()); - GKO_ASSERT_MTX_NEAR(d_next_krylov_basis, next_krylov_basis, 1e-14); GKO_ASSERT_MTX_NEAR(d_givens_sin, givens_sin, 1e-14); GKO_ASSERT_MTX_NEAR(d_givens_cos, givens_cos, 1e-14); GKO_ASSERT_MTX_NEAR(d_residual_norm, residual_norm, 1e-14); @@ -251,7 +274,7 @@ TEST_F(Gmres, CudaGmresStep1IsEquivalentToRef) 1e-14); GKO_ASSERT_MTX_NEAR(d_hessenberg_iter, hessenberg_iter, 1e-14); GKO_ASSERT_MTX_NEAR(d_krylov_bases, krylov_bases, 1e-14); - GKO_ASSERT_ARRAY_EQ(d_final_iter_nums, final_iter_nums); + GKO_ASSERT_ARRAY_EQ(*d_final_iter_nums, *final_iter_nums); } diff --git a/cuda/test/solver/ir_kernels.cpp b/cuda/test/solver/ir_kernels.cpp index 1265f637f76..35b844274b9 100644 --- a/cuda/test/solver/ir_kernels.cpp +++ b/cuda/test/solver/ir_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,21 +33,24 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include +#include -#include +#include -#include -#include #include #include #include +#include #include #include +#include "core/solver/ir_kernels.hpp" +#include "cuda/test/utils.hpp" + + namespace { @@ -133,4 +136,124 @@ TEST_F(Ir, ApplyIsEquivalentToRef) } +TEST_F(Ir, ApplyWithIterativeInnerSolverIsEquivalentToRef) +{ + auto mtx = gen_mtx(50, 50); + auto x = gen_mtx(50, 3); + auto b = gen_mtx(50, 3); + auto d_mtx = clone(cuda, mtx); + auto d_x = clone(cuda, x); + auto d_b = clone(cuda, b); + + auto ir_factory = + gko::solver::Ir<>::build() + .with_solver( + gko::solver::Gmres<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on( + ref)) + .on(ref)) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(2u).on(ref)) + .on(ref); + auto d_ir_factory = + gko::solver::Ir<>::build() + .with_solver( + gko::solver::Gmres<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on( + cuda)) + .on(cuda)) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(2u).on(cuda)) + .on(cuda); + auto solver = ir_factory->generate(std::move(mtx)); + auto d_solver = d_ir_factory->generate(std::move(d_mtx)); + + solver->apply(lend(b), lend(x)); + d_solver->apply(lend(d_b), lend(d_x)); + + // Note: 1e-12 instead of 1e-14, as the difference in the inner gmres + // iteration gets amplified by the difference in IR. 
+ GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Ir, RichardsonApplyIsEquivalentToRef) +{ + auto mtx = gen_mtx(50, 50); + auto x = gen_mtx(50, 3); + auto b = gen_mtx(50, 3); + auto d_mtx = clone(cuda, mtx); + auto d_x = clone(cuda, x); + auto d_b = clone(cuda, b); + // Forget about accuracy - Richardson is not going to converge for a random + // matrix, just check that a couple of iterations gives the same result on + // both executors + auto ir_factory = + gko::solver::Ir<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(2u).on(ref)) + .with_relaxation_factor(0.9) + .on(ref); + auto d_ir_factory = + gko::solver::Ir<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(2u).on(cuda)) + .with_relaxation_factor(0.9) + .on(cuda); + auto solver = ir_factory->generate(std::move(mtx)); + auto d_solver = d_ir_factory->generate(std::move(d_mtx)); + + solver->apply(lend(b), lend(x)); + d_solver->apply(lend(d_b), lend(d_x)); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); +} + + +TEST_F(Ir, RichardsonApplyWithIterativeInnerSolverIsEquivalentToRef) +{ + auto mtx = gen_mtx(50, 50); + auto x = gen_mtx(50, 3); + auto b = gen_mtx(50, 3); + auto d_mtx = clone(cuda, mtx); + auto d_x = clone(cuda, x); + auto d_b = clone(cuda, b); + auto ir_factory = + gko::solver::Ir<>::build() + .with_solver( + gko::solver::Gmres<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on( + ref)) + .on(ref)) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(2u).on(ref)) + .with_relaxation_factor(0.9) + .on(ref); + auto d_ir_factory = + gko::solver::Ir<>::build() + .with_solver( + gko::solver::Gmres<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on( + cuda)) + .on(cuda)) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(2u).on(cuda)) + .with_relaxation_factor(0.9) + .on(cuda); + auto solver = ir_factory->generate(std::move(mtx)); + auto d_solver = 
d_ir_factory->generate(std::move(d_mtx)); + + solver->apply(lend(b), lend(x)); + d_solver->apply(lend(d_b), lend(d_x)); + + // Note: 1e-12 instead of 1e-14, as the difference in the inner gmres + // iteration gets amplified by the difference in IR. + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + } // namespace diff --git a/cuda/test/solver/lower_trs_kernels.cpp b/cuda/test/solver/lower_trs_kernels.cpp index c855d955635..a2cac176e8c 100644 --- a/cuda/test/solver/lower_trs_kernels.cpp +++ b/cuda/test/solver/lower_trs_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -38,6 +38,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include + + #include @@ -48,7 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/solver/lower_trs_kernels.hpp" -#include "core/test/utils.hpp" +#include "cuda/test/utils.hpp" namespace { diff --git a/cuda/test/solver/upper_trs_kernels.cpp b/cuda/test/solver/upper_trs_kernels.cpp index cd5584ff452..92a76b1e47b 100644 --- a/cuda/test/solver/upper_trs_kernels.cpp +++ b/cuda/test/solver/upper_trs_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -38,6 +38,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include + + #include @@ -48,7 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/solver/upper_trs_kernels.hpp" -#include "core/test/utils.hpp" +#include "cuda/test/utils.hpp" namespace { diff --git a/cuda/test/stop/CMakeLists.txt b/cuda/test/stop/CMakeLists.txt index 5e686b8fbb4..0ba0781e077 100644 --- a/cuda/test/stop/CMakeLists.txt +++ b/cuda/test/stop/CMakeLists.txt @@ -1,2 +1,2 @@ ginkgo_create_test(criterion_kernels) -ginkgo_create_test(residual_norm_reduction_kernels) +ginkgo_create_test(residual_norm_kernels) diff --git a/cuda/test/stop/criterion_kernels.cpp b/cuda/test/stop/criterion_kernels.cpp index a690cb1fafd..8265ffea284 100644 --- a/cuda/test/stop/criterion_kernels.cpp +++ b/cuda/test/stop/criterion_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -31,12 +31,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ #include -#include #include +#include + + +#include "cuda/test/utils.hpp" + + namespace { diff --git a/cuda/test/stop/residual_norm_kernels.cpp b/cuda/test/stop/residual_norm_kernels.cpp new file mode 100644 index 00000000000..ec5dc3bf511 --- /dev/null +++ b/cuda/test/stop/residual_norm_kernels.cpp @@ -0,0 +1,369 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include "cuda/test/utils.hpp" + + +namespace { + + +constexpr double tol = 1.0e-14; + + +class ResidualNormReduction : public ::testing::Test { +protected: + using Mtx = gko::matrix::Dense<>; + + ResidualNormReduction() + { + ref_ = gko::ReferenceExecutor::create(); + cuda_ = gko::CudaExecutor::create(0, ref_); + factory_ = gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(tol) + .on(cuda_); + } + + std::unique_ptr::Factory> factory_; + std::shared_ptr cuda_; + std::shared_ptr ref_; +}; + + +TEST_F(ResidualNormReduction, WaitsTillResidualGoal) +{ + auto res = gko::initialize({100.0}, ref_); + auto d_res = Mtx::create(cuda_); + d_res->copy_from(res.get()); + std::shared_ptr rhs = gko::initialize({10.0}, ref_); + std::shared_ptr d_rhs = Mtx::create(cuda_); + d_rhs->copy_from(rhs.get()); + auto criterion = factory_->generate(nullptr, d_rhs, nullptr, d_res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(ref_, 1); + stop_status.get_data()[0].reset(); + stop_status.set_executor(cuda_); + + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res->at(0) = tol * 1.1e+2; + d_res->copy_from(res.get()); + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_FALSE(stop_status.get_data()[0].has_converged()); + stop_status.set_executor(cuda_); + ASSERT_FALSE(one_changed); + + res->at(0) = tol * 0.9e+2; + d_res->copy_from(res.get()); + ASSERT_TRUE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_TRUE(stop_status.get_data()[0].has_converged()); + ASSERT_TRUE(one_changed); +} + + 
+TEST_F(ResidualNormReduction, WaitsTillResidualGoalMultipleRHS) +{ + auto res = gko::initialize({{100.0, 100.0}}, ref_); + auto d_res = Mtx::create(cuda_); + d_res->copy_from(res.get()); + std::shared_ptr rhs = + gko::initialize({{10.0, 10.0}}, ref_); + std::shared_ptr d_rhs = Mtx::create(cuda_); + d_rhs->copy_from(rhs.get()); + auto criterion = factory_->generate(nullptr, d_rhs, nullptr, d_res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(ref_, 2); + stop_status.get_data()[0].reset(); + stop_status.get_data()[1].reset(); + stop_status.set_executor(cuda_); + + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res->at(0, 0) = tol * 0.9e+2; + d_res->copy_from(res.get()); + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_TRUE(stop_status.get_data()[0].has_converged()); + stop_status.set_executor(cuda_); + ASSERT_TRUE(one_changed); + + res->at(0, 1) = tol * 0.9e+2; + d_res->copy_from(res.get()); + ASSERT_TRUE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_TRUE(stop_status.get_data()[1].has_converged()); + ASSERT_TRUE(one_changed); +} + + +class RelativeResidualNorm : public ::testing::Test { +protected: + using Mtx = gko::matrix::Dense<>; + + RelativeResidualNorm() + { + ref_ = gko::ReferenceExecutor::create(); + cuda_ = gko::CudaExecutor::create(0, ref_); + factory_ = + gko::stop::RelativeResidualNorm<>::build().with_tolerance(tol).on( + cuda_); + } + + std::unique_ptr::Factory> factory_; + std::shared_ptr cuda_; + std::shared_ptr ref_; +}; + + +TEST_F(RelativeResidualNorm, WaitsTillResidualGoal) +{ + auto res = gko::initialize({100.0}, ref_); + auto d_res = Mtx::create(cuda_); + 
d_res->copy_from(res.get()); + std::shared_ptr rhs = gko::initialize({10.0}, ref_); + std::shared_ptr d_rhs = Mtx::create(cuda_); + d_rhs->copy_from(rhs.get()); + auto criterion = factory_->generate(nullptr, d_rhs, nullptr, d_res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(ref_, 1); + stop_status.get_data()[0].reset(); + stop_status.set_executor(cuda_); + + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res->at(0) = tol * 1.1e+1; + d_res->copy_from(res.get()); + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_FALSE(stop_status.get_data()[0].has_converged()); + stop_status.set_executor(cuda_); + ASSERT_FALSE(one_changed); + + res->at(0) = tol * 0.9e+1; + d_res->copy_from(res.get()); + ASSERT_TRUE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_TRUE(stop_status.get_data()[0].has_converged()); + ASSERT_TRUE(one_changed); +} + + +TEST_F(RelativeResidualNorm, WaitsTillResidualGoalMultipleRHS) +{ + auto res = gko::initialize({{100.0, 100.0}}, ref_); + auto d_res = Mtx::create(cuda_); + d_res->copy_from(res.get()); + std::shared_ptr rhs = + gko::initialize({{10.0, 10.0}}, ref_); + std::shared_ptr d_rhs = Mtx::create(cuda_); + d_rhs->copy_from(rhs.get()); + auto criterion = factory_->generate(nullptr, d_rhs, nullptr, d_res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(ref_, 2); + stop_status.get_data()[0].reset(); + stop_status.get_data()[1].reset(); + stop_status.set_executor(cuda_); + + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res->at(0, 0) = tol * 
0.9e+1; + d_res->copy_from(res.get()); + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_TRUE(stop_status.get_data()[0].has_converged()); + stop_status.set_executor(cuda_); + ASSERT_TRUE(one_changed); + + res->at(0, 1) = tol * 0.9e+1; + d_res->copy_from(res.get()); + ASSERT_TRUE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_TRUE(stop_status.get_data()[1].has_converged()); + ASSERT_TRUE(one_changed); +} + + +class AbsoluteResidualNorm : public ::testing::Test { +protected: + using Mtx = gko::matrix::Dense<>; + + AbsoluteResidualNorm() + { + ref_ = gko::ReferenceExecutor::create(); + cuda_ = gko::CudaExecutor::create(0, ref_); + factory_ = + gko::stop::AbsoluteResidualNorm<>::build().with_tolerance(tol).on( + cuda_); + } + + std::unique_ptr::Factory> factory_; + std::shared_ptr cuda_; + std::shared_ptr ref_; +}; + + +TEST_F(AbsoluteResidualNorm, WaitsTillResidualGoal) +{ + auto res = gko::initialize({100.0}, ref_); + auto d_res = Mtx::create(cuda_); + d_res->copy_from(res.get()); + std::shared_ptr rhs = gko::initialize({10.0}, ref_); + std::shared_ptr d_rhs = Mtx::create(cuda_); + d_rhs->copy_from(rhs.get()); + auto criterion = factory_->generate(nullptr, d_rhs, nullptr, d_res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(ref_, 1); + stop_status.get_data()[0].reset(); + stop_status.set_executor(cuda_); + + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res->at(0) = tol * 1.1; + d_res->copy_from(res.get()); + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + 
ASSERT_FALSE(stop_status.get_data()[0].has_converged()); + stop_status.set_executor(cuda_); + ASSERT_FALSE(one_changed); + + res->at(0) = tol * 0.9; + d_res->copy_from(res.get()); + ASSERT_TRUE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_TRUE(stop_status.get_data()[0].has_converged()); + ASSERT_TRUE(one_changed); +} + + +TEST_F(AbsoluteResidualNorm, WaitsTillResidualGoalMultipleRHS) +{ + auto res = gko::initialize({{100.0, 100.0}}, ref_); + auto d_res = Mtx::create(cuda_); + d_res->copy_from(res.get()); + std::shared_ptr rhs = + gko::initialize({{10.0, 10.0}}, ref_); + std::shared_ptr d_rhs = Mtx::create(cuda_); + d_rhs->copy_from(rhs.get()); + auto criterion = factory_->generate(nullptr, d_rhs, nullptr, d_res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(ref_, 2); + stop_status.get_data()[0].reset(); + stop_status.get_data()[1].reset(); + stop_status.set_executor(cuda_); + + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res->at(0, 0) = tol * 0.9; + d_res->copy_from(res.get()); + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_TRUE(stop_status.get_data()[0].has_converged()); + stop_status.set_executor(cuda_); + ASSERT_TRUE(one_changed); + + res->at(0, 1) = tol * 0.9; + d_res->copy_from(res.get()); + ASSERT_TRUE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_TRUE(stop_status.get_data()[1].has_converged()); + ASSERT_TRUE(one_changed); +} + + +} // namespace diff --git a/cuda/test/stop/residual_norm_reduction_kernels.cpp b/cuda/test/stop/residual_norm_reduction_kernels.cpp 
deleted file mode 100644 index 9190590ebd7..00000000000 --- a/cuda/test/stop/residual_norm_reduction_kernels.cpp +++ /dev/null @@ -1,146 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*************************************************************/ - -#include - - -#include - - -namespace { - - -constexpr double reduction_factor = 1.0e-14; - - -class ResidualNormReduction : public ::testing::Test { -protected: - using Mtx = gko::matrix::Dense<>; - - ResidualNormReduction() - { - ref_ = gko::ReferenceExecutor::create(); - cuda_ = gko::CudaExecutor::create(0, ref_); - factory_ = gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(reduction_factor) - .on(cuda_); - } - - std::unique_ptr::Factory> factory_; - std::shared_ptr cuda_; - std::shared_ptr ref_; -}; - - -TEST_F(ResidualNormReduction, WaitsTillResidualGoal) -{ - auto scalar = gko::initialize({1.0}, ref_); - auto d_scalar = Mtx::create(cuda_); - d_scalar->copy_from(scalar.get()); - auto criterion = - factory_->generate(nullptr, nullptr, nullptr, d_scalar.get()); - bool one_changed{}; - constexpr gko::uint8 RelativeStoppingId{1}; - gko::Array stop_status(ref_, 1); - stop_status.get_data()[0].reset(); - stop_status.set_executor(cuda_); - - ASSERT_FALSE( - criterion->update() - .residual_norm(d_scalar.get()) - .check(RelativeStoppingId, true, &stop_status, &one_changed)); - - scalar->at(0) = reduction_factor * 1.0e+2; - d_scalar->copy_from(scalar.get()); - ASSERT_FALSE( - criterion->update() - .residual_norm(d_scalar.get()) - .check(RelativeStoppingId, true, &stop_status, &one_changed)); - stop_status.set_executor(ref_); - ASSERT_EQ(stop_status.get_data()[0].has_converged(), false); - stop_status.set_executor(cuda_); - ASSERT_EQ(one_changed, false); - - scalar->at(0) = reduction_factor * 1.0e-2; - d_scalar->copy_from(scalar.get()); - ASSERT_TRUE( - criterion->update() - .residual_norm(d_scalar.get()) - .check(RelativeStoppingId, true, &stop_status, &one_changed)); - stop_status.set_executor(ref_); - ASSERT_EQ(stop_status.get_data()[0].has_converged(), true); - ASSERT_EQ(one_changed, true); -} - - -TEST_F(ResidualNormReduction, WaitsTillResidualGoalMultipleRHS) -{ - auto mtx = 
gko::initialize({{1.0, 1.0}}, ref_); - auto d_mtx = Mtx::create(cuda_); - d_mtx->copy_from(mtx.get()); - auto criterion = factory_->generate(nullptr, nullptr, nullptr, d_mtx.get()); - bool one_changed{}; - constexpr gko::uint8 RelativeStoppingId{1}; - gko::Array stop_status(ref_, 2); - stop_status.get_data()[0].reset(); - stop_status.get_data()[1].reset(); - stop_status.set_executor(cuda_); - - ASSERT_FALSE( - criterion->update() - .residual_norm(d_mtx.get()) - .check(RelativeStoppingId, true, &stop_status, &one_changed)); - - mtx->at(0, 0) = reduction_factor * 1.0e-2; - d_mtx->copy_from(mtx.get()); - ASSERT_FALSE( - criterion->update() - .residual_norm(d_mtx.get()) - .check(RelativeStoppingId, true, &stop_status, &one_changed)); - stop_status.set_executor(ref_); - ASSERT_EQ(stop_status.get_data()[0].has_converged(), true); - stop_status.set_executor(cuda_); - ASSERT_EQ(one_changed, true); - - mtx->at(0, 1) = reduction_factor * 1.0e-2; - d_mtx->copy_from(mtx.get()); - ASSERT_TRUE( - criterion->update() - .residual_norm(d_mtx.get()) - .check(RelativeStoppingId, true, &stop_status, &one_changed)); - stop_status.set_executor(ref_); - ASSERT_EQ(stop_status.get_data()[1].has_converged(), true); - ASSERT_EQ(one_changed, true); -} - - -} // namespace diff --git a/cuda/test/utils.hpp b/cuda/test/utils.hpp new file mode 100644 index 00000000000..903ed6a77c3 --- /dev/null +++ b/cuda/test/utils.hpp @@ -0,0 +1,54 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CUDA_TEST_UTILS_HPP_ +#define GKO_CUDA_TEST_UTILS_HPP_ + + +#include "core/test/utils.hpp" + + +#include + + +namespace { + + +// prevent device reset after each test +auto no_reset_exec = + gko::CudaExecutor::create(0, gko::ReferenceExecutor::create(), true); + + +} // namespace + + +#endif // GKO_CUDA_TEST_UTILS_HPP_ diff --git a/cuda/test/utils/assertions_test.cpp b/cuda/test/utils/assertions_test.cpp index d5e385eea8a..71a2fb0109b 100644 --- a/cuda/test/utils/assertions_test.cpp +++ b/cuda/test/utils/assertions_test.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -30,8 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ - -#include +#include "core/test/utils/assertions.hpp" #include @@ -41,6 +40,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "cuda/test/utils.hpp" + + namespace { @@ -70,10 +72,10 @@ TEST_F(MatricesNear, CanPassCudaMatrix) auto mtx = gko::initialize>( {{1.0, 2.0, 3.0}, {0.0, 4.0, 0.0}}, ref); // TODO: CUDA conversion Dense -> Csr not yet implemented - auto csr_omp = gko::matrix::Csr<>::create(ref); - csr_omp->copy_from(mtx.get()); + auto csr_ref = gko::matrix::Csr<>::create(ref); + csr_ref->copy_from(mtx.get()); auto csr_mtx = gko::matrix::Csr<>::create(cuda); - csr_mtx->copy_from(std::move(csr_omp)); + csr_mtx->copy_from(std::move(csr_ref)); GKO_EXPECT_MTX_NEAR(csr_mtx, mtx, 0.0); GKO_ASSERT_MTX_NEAR(csr_mtx, mtx, 0.0); diff --git a/dev_tools/containers/README.md b/dev_tools/containers/README.md deleted file mode 100644 index 7e3a1818f2d..00000000000 --- a/dev_tools/containers/README.md +++ /dev/null @@ -1,161 +0,0 @@ -# Purpose -The purpose of this file is to explain how to create or use containers for Ginkgo. - -Custom containers are used in Ginkgo in order to test the correct functionality -of the library. As Ginkgo is a C++ CUDA-enabled library, it is important to test -both a wide variety of compilers and CUDA versions as part of the development -process. This allows to ensure Ginkgo is and stays compatible with the specified -compilers and CUDA versions. 
-# Tools used -To create and deploy containers, we will use: -+ [NVIDIA's container registry](https://ngc.nvidia.com/registry/nvidia-cuda) -+ [NVIDIA HPC Container Maker (HPCCM)](https://github.com/NVIDIA/hpc-container-maker/) -+ [nvidia-docker2](https://github.com/NVIDIA/nvidia-docker) should be installed and available -+ A [local docker registry](https://docs.docker.com/registry/deploying/#run-a-local-registry) should be up and running -+ docker and gitlab-runner -# Ginkgo containers -Creating container images is a tedious task. The [usual -process](https://docs.docker.com/develop/develop-images/dockerfile_best-practices/) -requires writing what is called a `Dockerfile` which contains all commands -needed to build an image. - -To facilitate building new docker images, it is advised to start with an already -existing container image (such as an ubuntu image), and extend it with new -functionalities to generate a new container. In our context this is what we will -be doing. Nevertheless, to facilitate container generation we have decided to -rely on NVIDIA's HPCCM. - -## Ginkgo HPCCM recipes -HPCCM facilitates the container creation process significantly through a -high-level interface. HPCCM uses 'recipes', python files containing base -instructions, similar to a cookbook, tailored to generate Dockerfiles. Recipes -can take in arguments which allows to reuse the same recipe for building a wide -variety of containers. By default, HPCCM supports multiple packages and Linux -distributions which increases the portability of the HPCCM recipes. - -### Description -Ginkgo provides two recipes for creating containers. 
They are : -+ ginkgo-cuda-base.py: based on [NVIDIA's docker images](https://ngc.nvidia.com/registry/nvidia-cuda) -+ ginkgo-nocuda-base.py: based on the basic ubuntu image - -There is minor differences, but all of Ginkgo's recipes install the following -packages: -+ GNU compilers -+ LLVM/Clang -+ Intel Compilers -+ OpenMP -+ Python 2 and 3 -+ cmake -+ git, openssh, doxygen, curl (these are required for some synchronization or - documentation building jobs) -+ valgrind, graphviz, jq (documentation and debugging) - -### CUDA recipes -Every container is tailored to have matching CUDA, GNU Compilers and LLVM/Clang -versions. The information for compatible versions can usually be found in -[NVIDIA's -documentation](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html). - -+ CUDA is provided by default from nvidia-cuda, and requires no particular setup. -+ GNU and Clang compilers should use the extra_packages argument in order to - have access to a repository providing all compiler versions (otherwise the - default limit is gcc 5.4). -+ Arguments can be provided for CUDA, GNU and LLVM version. -+ It is required to use `libomp-dev` library for Clang+OpenMP to work. -+ hwloc is built and the server's topology is added to the container. -+ Finally, `LIBRARY_PATH` and `LD_LIBRARY_PATH` are properly setup for the CUDA - library. For proper CMake detection of the GPUs, this should maybe be - extended. - - -The dockerfiles and container images already generated are: -+ CUDA 9.0, GNU 5.5, LLVM 3.9, no Intel -+ CUDA 9.1, GNU 6, LLVM 4.0, Intel 2017 update 4 -+ CUDA 9.2, GNU 7, LLVM 5.0, Intel 2017 update 4 -+ CUDA 10.0, GNU 7, LLVM 6.0, Intel 2018 update 1 -+ CUDA 10.1, GNU 8, LLVM 7, Intel 2019 update 4 - -### No CUDA recipe -Because CUDA limits the versions of compilers it can work with, it is good -practice to provide non-CUDA containers, particularly for the more recent -compilers. 
- -The base image for this recipe is the same Ubuntu version as NVIDIA's to keep -the systems as similar as possible. There is only one extra difference for this -recipe: the image is very light and does not include the `make` command by -default, so it is necessary to add the `build-essential` package to the -requirements. - -In addition to the previous argument, an extra `papi` argument can be given. -This argument can be set to `True` to indicate that the image should be built -for papi support. In this case, if papi files can be found, the library perfmon -(`libpfm4`) is added do the docker container and papi files are copied to the -container from a folder named `papi/` with the following format: -+ `papi/include`: papi include files -+ `papi/lib`: papi pre-built library files -+ `papi/bin`: papi pre-built binary files - -The dockerfiles and container images already generated are: -+ GNU 9, LLVM 8 , Intel 2019 update 4. -## Using HPCCM recipes and docker to create containers -The following explains how to use recipes and docker to create new containers. -### Generate the Dockerfile -This is done with the NVIDIA's HPCCM tool. A base recipe should be given and the - output should be written to a file, the dockerfile. -```bash - hpccm --recipe ginkgo-cuda-base.py --userarg cuda=10.0 gnu=8 llvm=6.0 > gko-cuda100-gnu8-llvm60.baseimage -``` -### Using docker to build the container -The command simply uses `docker build` the standard command for container -generation. -```bash -docker build -t localhost:5000/gko-cuda100-gnu8-llvm60 -f gko-cuda100-gnu8-llvm60.baseimage . -``` -A name is given to the image through `-t tag`. It is required to append -`localhost:5000/` to designate our server's local container registry. -The base image (or dockerfile) is given through the `-f` argument. -The path given here is `.`. This is important if building an image from the -`gko-nocuda-base` base image. 
This indicates the path where the relevant papi -pre-built files (`papi/include/...`, etc) files to be put into the container can -be found. -### Test the generated container -The created container should be tested to ensure all supposed functionalities -are present and properly working. Here is a standard procedure for this: -```bash -# get interactive access to a container -docker run --rm --runtime=nvidia -ti localhost:5000/gko-cuda100-gnu8-llvm60 -nvidia-smi -g++ --version -clang++ --version -``` - -In addition, it can be useful to test the CUDA and OpenMP functionality, for -this purpose a short C-program can be created such as: -```c++ - #include - #include - #define SOME_LIMIT 10000 - - int main() - { - cuInit(0); //whatever other CUDA standard API call - - int acc = 0; - #pragma omp parallel for - for (int i=0; i gko-nocuda-gnu8-llvm70.baseimage -list=('gko-nocuda-gnu8-llvm70.baseimage') -hpccm --recipe ginkgo-nocuda-base.py --userarg gnu=9 llvm=8 papi=True > gko-nocuda-gnu9-llvm8.baseimage -list+=('gko-nocuda-gnu9-llvm8.baseimage') -if [ "$HOSTNAME" = "amdci" ]; then - list+=('gko-amd-gnu7-llvm60.baseimage') -else - hpccm --recipe ginkgo-cuda-base.py --userarg cuda=10.1 gnu=8 llvm=7 > gko-cuda101-gnu8-llvm70.baseimage - hpccm --recipe ginkgo-cuda-base.py --userarg cuda=10.0 gnu=7 llvm=6.0 > gko-cuda100-gnu7-llvm60.baseimage - hpccm --recipe ginkgo-cuda-base.py --userarg cuda=9.2 gnu=7 llvm=5.0 > gko-cuda92-gnu7-llvm50.baseimage - hpccm --recipe ginkgo-cuda-base.py --userarg cuda=9.1 gnu=6 llvm=4.0 > gko-cuda91-gnu6-llvm40.baseimage - hpccm --recipe ginkgo-cuda-base.py --userarg cuda=9.0 gnu=5 llvm=3.9 > gko-cuda90-gnu5-llvm39.baseimage - list+=(gko-cuda*.baseimage) -fi - -for i in "${list[@]}" -do - name=$(echo $i | cut -d"." -f1) - docker build -t localhost:5000/$name -f $i . 
- docker push localhost:5000/$name -done diff --git a/dev_tools/containers/ginkgo-cuda-base.py b/dev_tools/containers/ginkgo-cuda-base.py deleted file mode 100644 index 636ccd02015..00000000000 --- a/dev_tools/containers/ginkgo-cuda-base.py +++ /dev/null @@ -1,126 +0,0 @@ -""" -Ginkgo Base image -Contents: - CUDA version set by the user - GNU compilers version set by the user - LLVM/Clang clang-tidy version set by the user - Intel ICC and ICPC version set according to the CUDA version - OpenMP latest apt version for Clang+OpenMP - Python 2 and 3 (upstream) - cmake (upstream) - git, openssh, doxygen, curl, valgrind, graphviz, jq latest apt version - build-essential, automake, pkg-config, libtool, latest apt version - iwyu precompiled version 6.0 - libthrust-dev latest apt version - gnupg-agent: latest apt version, for adding custom keys -""" -# pylint: disable=invalid-name, undefined-variable, used-before-assignment - -import os - -cuda_version = USERARG.get('cuda', '10.0') - -release_name = 'xenial' -image = 'nvidia/cuda:{}-devel-ubuntu16.04'.format(cuda_version) -Stage0.baseimage(image) - - -# Correctly set the LIBRARY_PATH -Stage0 += environment(variables={'CUDA_INSTALL_PATH': '/usr/local/cuda/'}) -Stage0 += environment(variables={'CUDA_PATH': '/usr/local/cuda/'}) -Stage0 += environment(variables={'CUDA_ROOT': '/usr/local/cuda/'}) -Stage0 += environment(variables={'CUDA_SDK': '/usr/local/cuda/'}) -Stage0 += environment(variables={'CUDA_INC_PATH': '/usr/local/cuda/include'}) -Stage0 += environment(variables={'PATH': '$PATH:/usr/local/cuda/bin'}) -Stage0 += environment(variables={'LIBRARY_PATH': '$LIBRARY_PATH:/usr/local/cuda/lib64/stubs'}) -Stage0 += environment(variables={'LD_LIBRARY_PATH': '$LD_LIBRARY_PATH:/usr/local/cuda/lib64/stubs'}) -Stage0 += environment(variables={'LD_RUN_PATH': 'usr/local/cuda/lib64/stubs'}) -Stage0 += environment(variables={'INCLUDEPATH': '/usr/local/cuda/include'}) -Stage0 += environment(variables={'CPATH': '/usr/local/cuda/include'}) 
-Stage0 += environment(variables={'MANPATH': '/usr/local/cuda/doc/man'}) - - -# Setup extra tools -Stage0 += python() -Stage0 += cmake(eula=True, version='3.14.5') -Stage0 += apt_get(ospackages=['git', 'openssh-client', 'doxygen', 'curl', 'valgrind', 'graphviz']) -Stage0 += apt_get(ospackages=['jq', 'iwyu']) -Stage0 += apt_get(ospackages=['build-essential', 'automake', 'pkg-config', 'libtool']) -Stage0 += apt_get(ospackages=['libthrust-dev']) -Stage0 += apt_get(ospackages=['gnupg-agent']) -Stage0 += apt_get(ospackages=['ca-certificates']) # weird github certificates problem - -# GNU compilers -gnu_version = USERARG.get('gnu', '7') -Stage0 += gnu(version=gnu_version, extra_repository=True) -gcov_update = ['update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-{} 90'.format(gnu_version)] -Stage0 += shell(commands=gcov_update) - -# Clang compilers -llvm_version = USERARG.get('llvm', '7') -clang_ver = 'clang-{}'.format(llvm_version) -repo_ver = ['deb http://apt.llvm.org/{}/ llvm-toolchain-{}-{} main'.format(release_name, release_name, llvm_version)] -Stage0 += apt_get(ospackages=[clang_ver, 'libomp-dev'], repositories=repo_ver, keys=['https://apt.llvm.org/llvm-snapshot.gpg.key']) -clang_update = 'update-alternatives --install /usr/bin/clang clang /usr/bin/clang-{} 90'.format(llvm_version) -clangpp_update = 'update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang-{} 90'.format(llvm_version) -Stage0 += shell(commands=[clang_update, clangpp_update]) - - -# clang-tidy -clangtidy = ['clang-tidy-{}'.format(llvm_version)] -Stage0 += packages(apt_ppas=['ppa:xorg-edgers/ppa'], apt=clangtidy) -clangtidyln = ['ln -s /usr/bin/clang-tidy-{} /usr/bin/clang-tidy'.format(llvm_version)] -Stage0 += shell(commands=clangtidyln) - -# IWYU -if os.path.isdir('bin/'): - Stage0 += copy(src='bin/*', dest='/usr/bin/') - -if os.path.isdir('sonar-scanner/') and float(cuda_version) >= float(10.0): - Stage0 += copy(src='sonar-scanner/', dest='/') - -# hwloc -if 
float(cuda_version) >= float(9.2): - Stage0 += shell(commands=['cd /var/tmp', - 'git clone https://github.com/open-mpi/hwloc.git hwloc']) - Stage0 += shell(commands=['cd /var/tmp/hwloc', './autogen.sh', - './configure --prefix=/usr --disable-nvml', 'make -j10', 'make install']) - Stage0 += shell(commands=['rm -rf /var/tmp/hwloc']) - - # upload valid FineCI topology and set it for hwloc - if os.path.isfile('topology/fineci.xml'): - Stage0 += copy(src='topology/fineci.xml', dest='/') - Stage0 += environment(variables={'HWLOC_XMLFILE': '/fineci.xml'}) - Stage0 += environment(variables={'HWLOC_THISSYSTEM': '1'}) - - -# Convert from CUDA version to Intel Compiler years -intel_versions = {'9.0' : '2017', '9.1' : '2017', '9.2' : '2017', '10.0' : '2018', '10.1' : '2019'} -intel_path = 'intel/parallel_studio_xe_{}/compilers_and_libraries/linux/'.format(intel_versions.get(cuda_version)) -if os.path.isdir(intel_path): - Stage0 += copy(src=intel_path+'bin/intel64/', dest='/opt/intel/bin/') - Stage0 += copy(src=intel_path+'lib/intel64/', dest='/opt/intel/lib/') - Stage0 += copy(src=intel_path+'include/', dest='/opt/intel/include/') - Stage0 += environment(variables={'INTEL_LICENSE_FILE': '28518@scclic1.scc.kit.edu'}) - Stage0 += environment(variables={'PATH': '$PATH:/opt/intel/bin'}) - Stage0 += environment(variables={'LIBRARY_PATH': '$LIBRARY_PATH:/opt/intel/lib'}) - Stage0 += environment(variables={'LD_LIBRARY_PATH': '$LD_LIBRARY_PATH:/opt/intel/lib'}) - Stage0 += environment(variables={'LD_RUN_PATH': '$LD_RUN_PATH:/opt/intel/lib'}) - - -# HIP -Stage0 += shell(commands=['cd /var/tmp', - 'git clone https://github.com/ROCm-Developer-Tools/HIP.git']) -Stage0 += shell(commands=['cd /var/tmp/HIP', 'mkdir build', 'cd build', - 'cmake ..', 'make install']) -Stage0 += shell(commands=['rm -rf /var/tmp/HIP']) -Stage0 += shell(commands=['cd /var/tmp', - 'git clone https://github.com/tcojean/hipBLAS.git']) -Stage0 += shell(commands=['cd /var/tmp/hipBLAS', 'mkdir build', 'cd build', - 
'cmake ..', 'make install']) -Stage0 += shell(commands=['rm -rf /var/tmp/hipBLAS']) -Stage0 += shell(commands=['cd /var/tmp', - 'git clone https://github.com/tcojean/hipSPARSE.git']) -Stage0 += shell(commands=['cd /var/tmp/hipSPARSE', 'mkdir build', 'cd build', - 'cmake -DBUILD_CUDA=on ..', 'make install']) -Stage0 += shell(commands=['rm -rf /var/tmp/hipSPARSE']) diff --git a/dev_tools/containers/ginkgo-nocuda-base.py b/dev_tools/containers/ginkgo-nocuda-base.py deleted file mode 100644 index 34b6dd78eb6..00000000000 --- a/dev_tools/containers/ginkgo-nocuda-base.py +++ /dev/null @@ -1,90 +0,0 @@ -""" -Ginkgo Base image -Contents: - GNU compilers version set by the user - LLVM/Clang version set by the user - Intel ICC and ICPC version set to the latest available version - OpenMP latest apt version for Clang+OpenMP - Python 2 and 3 (upstream) - cmake (upstream) - build-essential, git, openssh, curl, valgrind latest apt version - jq, graphviz, ghostscript, latest apt version - bison, flex latest apt version, required for doxygen compilation - doxygen: install the latest release - texlive: install the latest release - clang-tidy, iwyu: latest apt version - hwloc, libhwloc-dev, pkg-config latest apt version - papi: adds package libpfm4, and copy precompiled papi headers and files - from a directory called 'papi' - gpg-agent: latest apt version, for adding custom keys -""" -# pylint: disable=invalid-name, undefined-variable, used-before-assignment - -import os - -Stage0.baseimage('ubuntu:18.04') -release_name = 'bionic' - -# Setup extra tools -Stage0 += python() -Stage0 += cmake(eula=True) -Stage0 += apt_get(ospackages=['build-essential', 'git', 'openssh-client', 'curl', 'valgrind']) -Stage0 += apt_get(ospackages=['jq', 'graphviz', 'ghostscript']) -Stage0 += apt_get(ospackages=['clang-tidy', 'iwyu']) -Stage0 += apt_get(ospackages=['hwloc', 'libhwloc-dev', 'pkg-config']) -Stage0 += apt_get(ospackages=['gpg-agent']) -Stage0 += apt_get(ospackages=['ca-certificates']) # 
weird github certificates problem -Stage0 += apt_get(ospackages=['bison', 'flex']) - -# GNU compilers -gnu_version = USERARG.get('gnu', '9') -Stage0 += gnu(version=gnu_version, extra_repository=True) - -# Clang compilers -llvm_version = USERARG.get('llvm', '8') -clang_ver = 'clang-{}'.format(llvm_version) -repo_ver = ['deb http://apt.llvm.org/{}/ llvm-toolchain-{}-{} main'.format(release_name, release_name, llvm_version)] -Stage0 += apt_get(ospackages=[clang_ver, 'libomp-dev'], repositories=repo_ver, keys=['https://apt.llvm.org/llvm-snapshot.gpg.key']) -clang_update = 'update-alternatives --install /usr/bin/clang clang /usr/bin/clang-{} 90'.format(llvm_version) -clangpp_update = 'update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang-{} 90'.format(llvm_version) -Stage0 += shell(commands=[clang_update, clangpp_update]) - -# Doxygen -Stage0 += shell(commands=['cd /var/tmp', 'git clone https://github.com/doxygen/doxygen']) -Stage0 += shell(commands=['cd /var/tmp/doxygen', 'git checkout Release_1_8_16', - 'mkdir build', 'cd build', - 'cmake ..', 'make -j10', 'make install']) -Stage0 += shell(commands=['cd /var/tmp', 'rm -rf doxygen']) - -# Texlive -if os.path.isdir('texlive/'): - Stage0 += copy(src='texlive/texlive.profile', dest='/var/tmp') - Stage0 += shell(commands=['cd /var/tmp', 'wget ' - 'http://mirror.ctan.org/systems/texlive/tlnet/install-tl-unx.tar.gz', - 'tar -xvf install-tl-unx.tar.gz', 'cd install-tl-2*', - './install-tl --profile=../texlive.profile']) - Stage0 += shell(commands=['cd /var/tmp', 'rm -rf install-tl*']) - Stage0 += shell(commands=['tlmgr install mathtools float xcolor varwidth ' - 'fancyvrb multirow hanging adjustbox xkeyval ' - 'collectbox stackengine etoolbox listofitems ulem ' - 'wasysym sectsty tocloft newunicodechar caption etoc ' - 'pgf ec helvetic courier wasy']) - -# Copy PAPI libs -add_papi = USERARG.get('papi', 'False') -if os.path.isdir('papi/') and add_papi == 'True': - Stage0 += apt_get(ospackages=['libpfm4']) - 
Stage0 += copy(src='papi/include/*', dest='/usr/include/') - Stage0 += copy(src='papi/lib/*', dest='/usr/lib/') - Stage0 += copy(src='papi/bin/*', dest='/usr/bin/') - -intel_path = 'intel/parallel_studio_xe_2019/compilers_and_libraries/linux/' -if os.path.isdir(intel_path): - Stage0 += copy(src=intel_path+'bin/intel64/', dest='/opt/intel/bin/') - Stage0 += copy(src=intel_path+'lib/intel64/', dest='/opt/intel/lib/') - Stage0 += copy(src=intel_path+'include/', dest='/opt/intel/include/') - Stage0 += environment(variables={'INTEL_LICENSE_FILE': '28518@scclic1.scc.kit.edu'}) - Stage0 += environment(variables={'PATH': '$PATH:/opt/intel/bin'}) - Stage0 += environment(variables={'LIBRARY_PATH': '$LIBRARY_PATH:/opt/intel/lib'}) - Stage0 += environment(variables={'LD_LIBRARY_PATH': '$LD_LIBRARY_PATH:/opt/intel/lib'}) - Stage0 += environment(variables={'LD_RUN_PATH': '$LD_RUN_PATH:/opt/intel/lib'}) diff --git a/dev_tools/containers/gko-amd-gnu7-llvm60.baseimage b/dev_tools/containers/gko-amd-gnu7-llvm60.baseimage deleted file mode 100644 index 018129dd792..00000000000 --- a/dev_tools/containers/gko-amd-gnu7-llvm60.baseimage +++ /dev/null @@ -1,75 +0,0 @@ -FROM ubuntu:16.04 -MAINTAINER Terry Cojean - -# Initialize the image -# Modify to pre-install dev tools and ROCm packages -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends curl && \ - curl -sL http://repo.radeon.com/rocm/apt/debian/rocm.gpg.key | apt-key add - && \ - sh -c 'echo deb [arch=amd64] http://repo.radeon.com/rocm/apt/debian/ xenial main > /etc/apt/sources.list.d/rocm.list' && \ - apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - libelf1 \ - libnuma-dev \ - build-essential \ - git \ - vim-nox \ - cmake-curses-gui \ - kmod \ - rocm-dev \ - hipsparse hipblas rocthrust && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - - -RUN apt-get update -y && \ - apt-get install -y --no-install-recommends \ - wget && \ - rm -rf 
/var/lib/apt/lists/* -RUN mkdir -p /var/tmp && wget -q -nc --no-check-certificate -P /var/tmp https://cmake.org/files/v3.11/cmake-3.11.1-Linux-x86_64.sh && \ - /bin/sh /var/tmp/cmake-3.11.1-Linux-x86_64.sh --prefix=/usr/local --skip-license && \ - rm -rf /var/tmp/cmake-3.11.1-Linux-x86_64.sh - - -# GNU compiler -RUN apt-get update -y && \ - apt-get install -y --no-install-recommends software-properties-common && \ - apt-add-repository ppa:ubuntu-toolchain-r/test -y && \ - apt-get update -y && \ - apt-get install -y --no-install-recommends \ - gcc-7 \ - g++-7 \ - gfortran-7 && \ - rm -rf /var/lib/apt/lists/* - -RUN update-alternatives --install /usr/bin/gcc gcc $(which gcc-7) 30 && \ - update-alternatives --install /usr/bin/g++ g++ $(which g++-7) 30 && \ - update-alternatives --install /usr/bin/gfortran gfortran $(which gfortran-7) 30 - -# LLVM compiler -RUN apt-get update -y && \ - apt-get install -y --no-install-recommends \ - clang-6.0 && \ - rm -rf /var/lib/apt/lists/* -RUN update-alternatives --install /usr/bin/clang clang $(which clang-6.0) 30 && \ - update-alternatives --install /usr/bin/clang++ clang++ $(which clang++-6.0) 30 - -RUN apt-get update -y && \ - apt-get install -y --no-install-recommends \ - libomp-dev && \ - rm -rf /var/lib/apt/lists/* - - -# sed --in-place=.rocm-backup 's|^\(PATH=.*\)"$|\1:/opt/rocm/bin"|' /etc/environment - -USER root -WORKDIR /root -ENV PATH "${PATH}:/opt/rocm/bin" - -# The following are optional enhancements for the command-line experience -# Uncomment the following to install a pre-configured vim environment based on http://vim.spf13.com/ -# 1. Sets up an enhanced command line dev environment within VIM -# 2. 
Aliases GDB to enable TUI mode by default -#RUN curl -sL https://j.mp/spf13-vim3 | bash && \ -# echo "alias gdb='gdb --tui'\n" >> ~/.bashrc - -# Default to a login shell -CMD ["bash", "-l"] diff --git a/dev_tools/containers/texlive/texlive.profile b/dev_tools/containers/texlive/texlive.profile deleted file mode 100644 index 3c0cbaabe68..00000000000 --- a/dev_tools/containers/texlive/texlive.profile +++ /dev/null @@ -1,30 +0,0 @@ -# texlive.profile written on Fri Oct 11 15:47:26 2019 UTC -# It will NOT be updated and reflects only the -# installation profile at installation time. -selected_scheme scheme-basic -TEXDIR /usr/local/texlive/2019 -TEXMFCONFIG ~/.texlive2019/texmf-config -TEXMFHOME ~/texmf -TEXMFLOCAL /usr/local/texlive/texmf-local -TEXMFSYSCONFIG /usr/local/texlive/2019/texmf-config -TEXMFSYSVAR /usr/local/texlive/2019/texmf-var -TEXMFVAR ~/.texlive2019/texmf-var -binary_x86_64-linux 1 -instopt_adjustpath 1 -instopt_adjustrepo 1 -instopt_letter 0 -instopt_portable 0 -instopt_write18_restricted 1 -tlpdbopt_autobackup 1 -tlpdbopt_backupdir tlpkg/backups -tlpdbopt_create_formats 1 -tlpdbopt_desktop_integration 1 -tlpdbopt_file_assocs 1 -tlpdbopt_generate_updmap 0 -tlpdbopt_install_docfiles 1 -tlpdbopt_install_srcfiles 1 -tlpdbopt_post_code 1 -tlpdbopt_sys_bin /usr/bin -tlpdbopt_sys_info /usr/info -tlpdbopt_sys_man /usr/man -tlpdbopt_w32_multi_user 1 \ No newline at end of file diff --git a/dev_tools/scripts/add_license.ignore b/dev_tools/scripts/add_license.ignore new file mode 100644 index 00000000000..cfcb6f4adaa --- /dev/null +++ b/dev_tools/scripts/add_license.ignore @@ -0,0 +1,3 @@ +build/ +third_party/ +external-lib-interfacing.cpp \ No newline at end of file diff --git a/dev_tools/scripts/add_license.sh b/dev_tools/scripts/add_license.sh index 85d73595ebd..34caf4146b9 100755 --- a/dev_tools/scripts/add_license.sh +++ b/dev_tools/scripts/add_license.sh @@ -53,9 +53,9 @@ echo -e "/*${GINKGO_LICENSE_BEACON}\n$(cat ${LICENSE_FILE})\n${GINKGO_LICENSE_BE # 
Does not work if a found file (including the path) contains a newline find "${GINKGO_ROOT_DIR}" \ - ! \( -name "build" -prune -o -name "third_party" -prune -o -name "external-lib-interfacing.cpp" -prune \) \ - \( -name '*.cuh' -o -name '*.hpp' -o -name '*.hpp.in' -o -name '*.cpp' -o -name '*.cu' \) \ + \( -name '*.cuh' -o -name '*.hpp' -o -name '*.hpp.in' -o -name '*.cpp' -o -name '*.cu' -o -name '*.hpp.inc' \) \ -type f -print \ + | grep -F -v -f "${THIS_DIR}/add_license.ignore" \ | \ while IFS='' read -r i; do # `grep -F` is important here because the characters in the beacon should be matched against diff --git a/dev_tools/scripts/config b/dev_tools/scripts/config new file mode 100644 index 00000000000..768f3693327 --- /dev/null +++ b/dev_tools/scripts/config @@ -0,0 +1,40 @@ +- "test_install|benchmark" + - FixInclude: "ginkgo/ginkgo.hpp" +- "executor" + - FixInclude: "ginkgo/core/base/executor.hpp" +- "hip/base/config.hip.hpp" + - FixInclude: "hip/hip_runtime.h" +- "(cuda|hip|omp)/test/factorization/par_ilu_kernels" + - FixInclude: "core/factorization/par_ilu_kernels.hpp" +- "(cuda|hip)/preconditioner/jacobi_" + - FixInclude: "core/preconditioner/jacobi_kernels.hpp" +- "core/test/base/(extended_float|iterator_factory)" + - RemoveTest: "true" +- "_builder\.cpp" + - RemoveTest: "true" +- "_builder\.hpp" + - CoreSuffix: "_builder" +- "components.*_kernels(\.hip)?\.(cu|cpp|hpp|cuh)" + - CoreSuffix: "_kernels" + - RemoveTest: "true" +- "components" + - RemoveTest: "true" + - PathIgnore: "1" + - PathPrefix: "core" +- "test/utils" + - CoreSuffix: "_test" + - PathIgnore: "1" + - PathPrefix: "core" +- "core\/.*" + - CoreSuffix: "_kernels" + - PathPrefix: "ginkgo" + - PathIgnore: "0" + - RemoveTest: "true" +- "/(test|base)/" + - CoreSuffix: "_kernels" + - PathPrefix: "ginkgo/core" + - PathIgnore: "1" + - RemoveTest: "true" +- ".*" + - PathPrefix: "core" + - PathIgnore: "1" diff --git a/dev_tools/scripts/create_new_algorithm.sh b/dev_tools/scripts/create_new_algorithm.sh 
index 24b37b475fe..f6893f68c82 100755 --- a/dev_tools/scripts/create_new_algorithm.sh +++ b/dev_tools/scripts/create_new_algorithm.sh @@ -97,11 +97,13 @@ TEMPLATE_FILES=( "${name}_kernels.hpp" "${name}_kernels.cpp" "${name}_kernels.cpp" - "${name}_kernels.c*" + "${name}_*.[ch]*" + "${name}_kernels.hip.cpp" "${name}.cpp" "${name}_kernels.cpp" "${name}_kernels.cpp" "${name}_kernels.cpp" + "${name}_kernels.*" ) CMAKE_FILES=( "core/CMakeLists.txt" @@ -110,10 +112,12 @@ CMAKE_FILES=( "reference/CMakeLists.txt" "omp/CMakeLists.txt" "cuda/CMakeLists.txt" + "hip/CMakeLists.txt" "core/test/$source_type/CMakeLists.txt" "reference/test/$source_type/CMakeLists.txt" "omp/test/$source_type/CMakeLists.txt" "cuda/test/$source_type/CMakeLists.txt" + "hip/test/$source_type/CMakeLists.txt" ) TEMPLATE_FILES_LOCATIONS=( "core/$source_type" @@ -122,22 +126,26 @@ TEMPLATE_FILES_LOCATIONS=( "reference/$source_type" "omp/$source_type" "cuda/$source_type" + "hip/$source_type" "core/test/$source_type" "reference/test/$source_type" "omp/test/$source_type" "cuda/test/$source_type" + "hip/test/$source_type" ) TEMPLATE_FILES_TYPES=( "$source_type file" "class header" "kernel header" - "kernel file" - "kernel file" - "kernel file" - "unit tests for ${name} $type" + "Reference kernel file" + "OpenMP kernel file" + "CUDA kernel file" + "HIP kernel file" + "unit tests for ${name} $source_type" "unit tests for ${name} reference kernels" "unit tests for ${name} OMP kernels" "unit tests for ${name} CUDA kernels" + "unit tests for ${name} HIP kernels" ) TEMPLATE_FILES_DESCRIPTIONS=( "This is where the ${name} algorithm needs to be implemented." @@ -146,131 +154,140 @@ TEMPLATE_FILES_DESCRIPTIONS=( "Reference kernels for ${name} need to be implemented here." "OMP kernels for ${name} need to be implemented here." "CUDA kernels for ${name} need to be implemented here." - "" - "" - "" - "" + "HIP kernels for ${name} need to be implemented here." 
+ "This is where core related unit tests should be implemented, i.e. relating to the interface without executor usage." + "This is where tests with the Reference executor should be implemented. Usually, this means comparing against previously known values." + "This is where tests with the OpenMP executor should be implemented. Usually, this means comparing against a Reference execution." + "This is where tests with the CUDA executor should be implemented. Usually, this means comparing against a Reference execution." + "This is where tests with the HIP executor should be implemented. Usually, this means comparing against a Reference execution." ) mkdir ${TMPDIR} -# create folder for temporary files - -# copy files needed into temporary folder for (( i=1; i<${#TEMPLATE_FILES[@]}+1; i++ )) do sourcename=$(echo ${TEMPLATE_FILES[$i-1]} | sed "s/${name}/${source_name}/" ) sourcepath=${TEMPLATE_FILES_LOCATIONS[$i-1]}/${sourcename} - file=$(ls ${GINKGO_ROOT_DIR}/${sourcepath}) - if [ -f "$file" ] - then - # We have evaluated the extension and found it - # Integrate it in the template list - filename=$(basename -- ${file}) - source_path=${TEMPLATE_FILES_LOCATIONS[$i-1]}/${filename} - TEMPLATE_FILES[$i-1]=$(echo "${filename}" | sed "s/${source_name}/${name}/") - else - echo "Warning: Source file $sourcepath was not found." 
- fi - destpath=${TEMPLATE_FILES_LOCATIONS[$i-1]}/${TEMPLATE_FILES[$i-1]} + # create folder for temporary files mkdir -p ${TMPDIR}/${TEMPLATE_FILES_LOCATIONS[$i-1]} - cp ${GINKGO_ROOT_DIR}/$sourcepath ${TMPDIR}/$destpath -done -# search and replace keywords with new solver name -echo -e "\nCreating temporary files:" -for (( i=1; i<${#TEMPLATE_FILES[@]}+1; i++ )) -do - destpath=${TEMPLATE_FILES_LOCATIONS[$i-1]}/${TEMPLATE_FILES[$i-1]} - perl -pi -e "s/${source_name}/$name/g" ${TMPDIR}/$destpath - perl -pi -e "s/${source_name^}/$Name/g" ${TMPDIR}/$destpath - perl -pi -e "s/${source_name^^}/$NAME/g" ${TMPDIR}/$destpath + # Evaluate the extension and try to find the matching files + for j in $(ls ${GINKGO_ROOT_DIR}/${sourcepath}) + do + if [ -f "$j" ] + then + filename=$(basename -- ${j}) + source_path=${TEMPLATE_FILES_LOCATIONS[$i-1]}/${filename} + destname=$(echo "${filename}" | sed "s/${source_name}/${name}/") + destpath=${TEMPLATE_FILES_LOCATIONS[$i-1]}/$destname + cp ${GINKGO_ROOT_DIR}/$source_path ${TMPDIR}/$destpath - # Comment all code - awk '/^{$/,/^}$/ { if ($0 == "{"){ print "GKO_NOT_IMPLEMENTED;"; print "//" $0; print "// TODO (script): change the code imported from '${source_type}'/'${source_name}' if needed"; next} else { print "//" $0; next }} 1' ${TMPDIR}/$destpath > tmp - mv tmp ${TMPDIR}/$destpath + # Replace all instances of source_name by the user's requested name + perl -n -i -e "print unless m/.*common.*${source_name}_kernels.hpp.inc.*/" ${TMPDIR}/$destpath + perl -pi -e "s/${source_name}/$name/g" ${TMPDIR}/$destpath + perl -pi -e "s/${source_name^}/$Name/g" ${TMPDIR}/$destpath + perl -pi -e "s/${source_name^^}/$NAME/g" ${TMPDIR}/$destpath - ls ${TMPDIR}/$destpath -done + # Comment all code + awk -v name=${name} '/^{$/,/^}$/ { if ($0 == "{"){ print "GKO_NOT_IMPLEMENTED;"; print "//" $0; print "// TODO (script:" name "): change the code imported from '${source_type}'/'${source_name}' if needed"; next} else { print "//" $0; next }} 1' 
${TMPDIR}/$destpath > tmp + mv tmp ${TMPDIR}/$destpath -if [ $execute == 1 ] -then - echo -e "\nRenaming and distributing files" - # rename and distribute the files to the right location - # for each file, make sure it does not exist yet - for (( i=1; i<${#TEMPLATE_FILES[@]}+1; i++ )) - do - sourcepath=${TEMPLATE_FILES_LOCATIONS[$i-1]}/${TEMPLATE_FILES[$i-1]} - destpath=${TEMPLATE_FILES_LOCATIONS[$i-1]}/${TEMPLATE_FILES[$i-1]} - if [ ! -f ${GINKGO_ROOT_DIR}/$destpath ]; then - cp ${TMPDIR}/${sourcepath} ${GINKGO_ROOT_DIR}/${destpath} + ls ${TMPDIR}/$destpath + + if [ $execute == 1 ] + then + if [ ! -f ${GINKGO_ROOT_DIR}/$destpath ]; then + cp ${TMPDIR}/${destpath} ${GINKGO_ROOT_DIR}/${destpath} + else + echo -e "Error: file ${GINKGO_ROOT_DIR}/$destpath exists" + echo -e "Remove file first if you want to replace it." + read -p "" + fi + fi else - echo -e "Error: file ${GINKGO_ROOT_DIR}/$destpath exists" - echo -e "Remove file first if you want to replace it." - read -p "" + echo "Warning: Source file $sourcepath was not found." fi done +done - - echo -e "cleaning up temporary files." - rm -rf ${TMPDIR} - - +if [ $execute == 1 ] +then if [ $automatic_additions -eq 1 ] then ## Try to automatically add the files to CMakeLists - echo -e "Modifiying CMakeLists.txt and common_kernels.inc.cpp" + echo -e "Modifying CMakeLists.txt and common_kernels.inc.cpp" for ((i=1; i<=${#CMAKE_FILES[@]}; i++)) do - destpath=${TEMPLATE_FILES_LOCATIONS[$i-1]}/${TEMPLATE_FILES[$i-1]} - if [ ! -f ${GINKGO_ROOT_DIR}/${destpath} ]; - then - continue - fi - - cmake_file="${GINKGO_ROOT_DIR}/${CMAKE_FILES[$i-1]}" - if [[ $cmake_file == *"test/"* ]] - then - insert=$(grep -E "\(${source_name}[_\)]{1}" $cmake_file | sed "s/$source_name/$name/") - echo "$insert" >> $cmake_file - cat $cmake_file | sort > tmp - mv tmp $cmake_file - elif [[ $cmake_file != "${GINKGO_ROOT_DIR}/" ]] - then - ## Works only if we have something of the form: - ##target_sources( - ## PRIVATE - ## - ## ... 
- ## ) - list=( $(awk '/^target_sources/,/ .*\)/ {if ( match($0, "target_sources") == 0 && match($0, "PRIVATE") == 0 ) { print $0 }}' $cmake_file) ) - last_elem=$((${#list[@]}-1)) - list[$last_elem]=$(echo ${list[$last_elem]} | tr -d ')') - list+=( "$source_type/${TEMPLATE_FILES[$i-1]}" ) - IFS=$'\n' sorted=($(sort <<<"${list[*]}")) - unset IFS - last_elem=$((${#sorted[@]}-1)) - sorted[$last_elem]=$(echo ${sorted[$last_elem]}")") - - ## find the correct position - insert_to=$(grep -n -m 1 "target_sources" $cmake_file | sed 's/:.*//') - insert_to=$((insert_to + 1)) # account for the "PRIVATE" - - ## clear up the CMakeList.txt - awk '/^target_sources/,/ .*\)/ {if (match($0, "target_sources") != 0 || match($0, "PRIVATE") != 0){ print $0 }; next}1' $cmake_file > tmp - - mytmp=`mktemp` - head -n$insert_to tmp > $mytmp - for line in "${sorted[@]}" - do - echo " $line" >> $mytmp - done - tail -n +$((insert_to+1)) tmp >> $mytmp - mv $mytmp tmp - mv tmp $cmake_file - fi + sourcepath=${TEMPLATE_FILES_LOCATIONS[$i-1]}/${TEMPLATE_FILES[$i-1]} + for j in $(ls ${GINKGO_ROOT_DIR}/${sourcepath}) + do + filename=$(basename -- $j) + shortname=$(echo $filename | cut -d"." -f1) + sourcename=$(echo ${shortname} | sed "s/${name}/${source_name}/" ) + if [[ ! -f ${j} || "${j}" == *".hpp" || "${j}" == *".cuh" ]]; + then + continue + fi + + cmake_file="${GINKGO_ROOT_DIR}/${CMAKE_FILES[$i-1]}" + if [[ $cmake_file == *"test/"* ]] + then + insert=$(grep -E "\(${sourcename}[_\)]{1}" $cmake_file | sed "s/$source_name/$name/") + echo "$insert" >> $cmake_file + cat $cmake_file | sort > tmp + mv tmp $cmake_file + elif [[ $cmake_file != "${GINKGO_ROOT_DIR}/" ]] + then + ## For most directories this works with something of the form: + ##target_sources( + ## PRIVATE + ## + ## ... + ## ) + ## For HIP: + ##set(GINKGO_HIP_SOURCES + ## + ## ... 
+ ## ) + if [[ $cmake_file == *"hip/"* ]] + then + list=( $(awk '/^set\(GINKGO_HIP_SOURCES/,/ .*\)/ {if ( match($0, "GINKGO_HIP_SOURCES") == 0 ) { print $0 }}' $cmake_file) ) + else + list=( $(awk '/^target_sources/,/ .*\)/ {if ( match($0, "target_sources") == 0 && match($0, "PRIVATE") == 0 ) { print $0 }}' $cmake_file) ) + fi + + last_elem=$((${#list[@]}-1)) + list[$last_elem]=$(echo ${list[$last_elem]} | tr -d ')') + list+=( "$source_type/${filename}" ) + IFS=$'\n' sorted=($(sort <<<"${list[*]}")) + unset IFS + last_elem=$((${#sorted[@]}-1)) + sorted[$last_elem]=$(echo ${sorted[$last_elem]}")") + + ## find the correct position and clear up the CMakeList.txt + if [[ $cmake_file == *"hip/"* ]] + then + insert_to=$(grep -n -m 1 "GINKGO_HIP_SOURCES" $cmake_file | sed 's/:.*//') + awk '/^set\(GINKGO_HIP_SOURCES/,/ .*\)/ {if (match($0, "GINKGO_HIP_SOURCES") != 0 ){ print $0 }; next}1' $cmake_file > tmp + else + insert_to=$(grep -n -m 1 "target_sources" $cmake_file | sed 's/:.*//') + insert_to=$((insert_to + 1)) # account for the "PRIVATE" + awk '/^target_sources/,/ .*\)/ {if (match($0, "target_sources") != 0 || match($0, "PRIVATE") != 0){ print $0 }; next}1' $cmake_file > tmp + fi + + mytmp=`mktemp` + head -n$insert_to tmp > $mytmp + for line in "${sorted[@]}" + do + echo " $line" >> $mytmp + done + tail -n +$((insert_to+1)) tmp >> $mytmp + mv $mytmp tmp + mv tmp $cmake_file + fi + done done @@ -305,7 +322,7 @@ then mytmp=`mktemp` head -n$old_code_block_end $common_kernels_file > $mytmp - echo -e "\n\n// TODO (script): adapt this block as needed" >> $mytmp + echo -e "\n\n// TODO (script:${name}): adapt this block as needed" >> $mytmp for line in "${old_code_block[@]}" do echo -e "$line" | sed "s/${source_name^^}/$NAME/g" | sed "s/${source_name}/$name/g" >> $mytmp @@ -313,6 +330,9 @@ then tail -n +$((old_code_block_end+1)) $common_kernels_file >> $mytmp mv $mytmp $common_kernels_file fi + + echo -e "cleaning up temporary files." 
+ rm -rf ${TMPDIR} else echo -e "\nNo file was copied because --dry-run was used" echo -e "You can inspect the generated solver files in ${TMPDIR}." @@ -322,7 +342,7 @@ if [ -f todo_${name}.txt ]; then rm todo_${name}.txt fi -echo -e "\nSummary:" | tee -a todo_${name}.txt +echo -e "\n###Summary:" | tee -a todo_${name}.txt for (( i=1; i<${#TEMPLATE_FILES[@]}+1; i++ )) do destpath=${TEMPLATE_FILES_LOCATIONS[$i-1]}/${TEMPLATE_FILES[$i-1]} @@ -342,38 +362,35 @@ then do if [[ "${CMAKE_FILES[$i-1]}" != "" ]] then - echo "Modified ${CMAKE_FILES[$i-1]}" | tee -a todo_${name}.txt + echo "Modified ${CMAKE_FILES[$i-1]}" | tee -a todo_${name}.txt fi done - echo "Modified core/device_hooks/common_kernels.inc.cpp" | tee -a todo_${name}.txt + echo "Modified core/device_hooks/common_kernels.inc.cpp" | tee -a todo_${name}.txt fi -echo -e "In all of the previous files ${source_name} was automatically replaced into ${name}. Ensure there is no inconsistency." | tee -a todo_${name}.txt -echo -e "" | tee -a todo_${name}.txt -echo -e "All the imported code was commented and TODO items were generated in the new files." | tee -a todo_${name}.txt -echo -e "Check all the modified files for '// TODO (script):' items"| tee -a todo_${name}.txt -echo -e "e.g. 
by using grep -HR '// TODO (script):' ${GINKGO_ROOT_DIR}"| tee -a todo_${name}.txt -echo "" | tee -a todo_${name}.txt - if [ $automatic_additions -eq 0 ] then - echo "" | tee -a todo_${name}.txt - echo "The following CMakeLists have to be modified manually:" | tee -a todo_${name}.txt - echo "core/CMakeLists.txt" | tee -a todo_${name}.txt - echo "core/test/${source_type}/CMakeLists.txt" | tee -a todo_${name}.txt - echo "" | tee -a todo_${name}.txt - echo "reference/CMakeLists.txt" | tee -a todo_${name}.txt - echo "reference/test/${source_type}/CMakeLists.txt" | tee -a todo_${name}.txt - echo "" | tee -a todo_${name}.txt - echo "omp/CMakeLists.txt" | tee -a todo_${name}.txt - echo "omp/test/${source_type}/CMakeLists.txt" | tee -a todo_${name}.txt - echo "" | tee -a todo_${name}.txt - echo "cuda/CMakeLists.txt" | tee -a todo_${name}.txt - echo "cuda/test/${source_type}/CMakeLists.txt" | tee -a todo_${name}.txt - echo "" | tee -a todo_${name}.txt - echo "" | tee -a todo_${name}.txt - echo "The following header file has to modified:" | tee -a todo_${name}.txt - echo "core/device_hooks/common_kernels.inc.cpp" | tee -a todo_${name}.txt + echo "" | tee -a todo_${name}.txt + echo "The following CMakeLists have to be modified manually:"| tee -a todo_${name}.txt + echo "core/CMakeLists.txt" | tee -a todo_${name}.txt + echo "core/test/${source_type}/CMakeLists.txt" | tee -a todo_${name}.txt + echo "" | tee -a todo_${name}.txt + echo "reference/CMakeLists.txt" | tee -a todo_${name}.txt + echo "reference/test/${source_type}/CMakeLists.txt" | tee -a todo_${name}.txt + echo "" | tee -a todo_${name}.txt + echo "omp/CMakeLists.txt" | tee -a todo_${name}.txt + echo "omp/test/${source_type}/CMakeLists.txt" | tee -a todo_${name}.txt + echo "" | tee -a todo_${name}.txt + echo "cuda/CMakeLists.txt" | tee -a todo_${name}.txt + echo "cuda/test/${source_type}/CMakeLists.txt" | tee -a todo_${name}.txt + echo "" | tee -a todo_${name}.txt + echo "" | tee -a todo_${name}.txt + echo 
"hip/CMakeLists.txt" | tee -a todo_${name}.txt + echo "hip/test/${source_type}/CMakeLists.txt" | tee -a todo_${name}.txt + echo "" | tee -a todo_${name}.txt + echo "" | tee -a todo_${name}.txt + echo "The following header file has to be modified:" | tee -a todo_${name}.txt + echo "core/device_hooks/common_kernels.inc.cpp" | tee -a todo_${name}.txt echo "Equivalent to the other solvers, the following part has to be appended:" | tee -a todo_${name}.txt echo "##################################################" | tee -a todo_${name}.txt echo "#include #include \"core/solver/test_kernels.hpp\"" | tee -a todo_${name}.txt @@ -394,5 +411,17 @@ then echo "" | tee -a todo_${name}.txt echo "" | tee -a todo_${name}.txt fi + +echo -e "\n\n\n### TODO:" | tee -a todo_${name}.txt +echo -e "In all of the previous files ${source_name} was automatically replaced into ${name}. Ensure there is no inconsistency." | tee -a todo_${name}.txt +echo -e "" | tee -a todo_${name}.txt +echo -e "All the imported code was commented and TODO items were generated in the new files." | tee -a todo_${name}.txt +echo -e "Check all the modified files for \"// TODO (script:${name}):\" items"| tee -a todo_${name}.txt +echo -e "e.g. by using grep -nR \"// TODO (script:${name}):\" ${GINKGO_ROOT_DIR} | grep -v \"create_new_algorithm.sh\" | grep -v \"todo_${name}.txt\"." 
| tee -a todo_${name}.txt +echo "" | tee -a todo_${name}.txt +echo "A tentative list of relevant TODO items follows:" | tee -a todo_${name}.txt +grep -nR "// TODO (script:${name}):" ${GINKGO_ROOT_DIR} | grep -v "create_new_algorithm.sh" | grep -v "todo_${name}.txt" | tee -a todo_${name}.txt + + echo "A summary of the required next steps has been written to:" echo "todo_${name}.txt" diff --git a/dev_tools/scripts/cuda2hip.sh b/dev_tools/scripts/cuda2hip.sh new file mode 100755 index 00000000000..c372e0869b2 --- /dev/null +++ b/dev_tools/scripts/cuda2hip.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +HIPIFY=/opt/rocm/hip/bin/hipify-perl +# For some reasons, hipify from apt does not add HIP_KERNEL_NAME. + +if [ "$0" != "dev_tools/scripts/cuda2hip.sh" ]; then + echo "You are only allowed to run dev_tools/scripts/cuda2hip.sh in the ginkgo source folder." + exit 1 +fi + +if [ -z "$1" ]; then + echo "Usage: $0 path/to/cuda/file" + exit 2 +fi + +ORIGIN_FILE=$1 +echo "CUDA: ${ORIGIN_FILE}" +NEW_FILE=$(echo ${ORIGIN_FILE} | sed -E "s/^cuda/hip/g;s/(cuh|hpp)$/hip\.hpp/g;s/(cpp|cu)$/hip\.cpp/g") +echo "HIP: ${NEW_FILE}" +${HIPIFY} "${ORIGIN_FILE}" > "${NEW_FILE}" + +# String replacement +# header file +REG="s/(cuda[a-z\/_]*)(\.hpp|\.cuh)/\1.hip.hpp/g" +# cuda -> hip +REG="${REG};s/cuda/hip/g;s/Cuda/Hip/g;s/CUDA/HIP/g" +# cublas -> hipblas +REG="${REG};s/cublas/hipblas/g;s/Cublas/Hipblas/g;s/CUBLAS/HIPBLAS/g" +# cusparse -> hipsparse +REG="${REG};s/cusparse/hipsparse/g;s/Cusparse/Hipsparse/g;s/CUSPARSE/HIPSPARSE/g" +# culibs -> hiplibs +REG="${REG};s/culibs/hiplibs/g" +# header definition +REG="${REG};s/(CUH_|HPP_)$/HIP_HPP_/g" + +sed -i -E "${REG}" "${NEW_FILE}" + +# Move the namespace into correct place. 
+# {namespace}::hipLaunchKernelGGL( to hipLaunchKernelGGL({namespace}:: +sed -i -E "s/(.*)::hipLaunchKernelGGL\(/hipLaunchKernelGGL\(\1::/g" "${NEW_FILE}" +# {namespace}::HIP_KERNEL_NAME( to HIP_KERNEL_NAME({namespace}:: +sed -i -E "s/(.*)::HIP_KERNEL_NAME\(/HIP_KERNEL_NAME\(\1::/g" "${NEW_FILE}" diff --git a/dev_tools/scripts/format_header.sh b/dev_tools/scripts/format_header.sh new file mode 100755 index 00000000000..21b92419ccd --- /dev/null +++ b/dev_tools/scripts/format_header.sh @@ -0,0 +1,313 @@ +#!/usr/bin/env bash + +convert_header () { + local regex="^(#include )(<|\")(.*)(\"|>)$" + if [[ $@ =~ ${regex} ]]; then + header_file="${BASH_REMATCH[3]}" + if [ -f "${header_file}" ]; then + if [[ "${header_file}" =~ ^ginkgo ]]; then + echo "#include <${header_file}>" + else + echo "#include \"${header_file}\"" + fi + elif [ "${header_file}" = "matrices/config.hpp" ]; then + echo "#include \"${header_file}\"" + else + echo "#include <${header_file}>" + fi + else + echo "$@" + fi +} + +get_header_def () { + local regex="\.(hpp|cuh)" + if [[ $@ =~ $regex ]]; then + local def=$(echo "$@" | sed -E "s~include/ginkgo/~~g;s~/|\.~_~g") + def=$(echo GKO_${def^^}_) + echo $def + else + echo "" + fi +} + +add_regroup () { + cp .clang-format .clang-format.temp + sed -i "s~\.\.\.~~g" .clang-format + cat dev_tools/scripts/regroup >> .clang-format + echo "..." 
>> .clang-format +} + +remove_regroup () { + mv .clang-format.temp .clang-format +} + +# It reads "dev_tools/scripts/config" to generate the corresponding main header +# The settings of each rule are: +# - "file_regex" +# - CoreSuffix: "core_suffix_regex" (default "") +# - PathPrefix: "path_prefix_regex" (default "") +# - PathIgnore: "path_ignore_number" (default "0") +# - RemoveTest: "false/true" (default "false") +# - FixInclude: "the specific main header" (default "") +# A "file_regex" without any setting is fine; the main header is then the file with the same name and a header suffix +# For example, /path/to/file.cpp will change to /path/to/file.hpp +# file_regex : selects the files to which this rule applies +# CoreSuffix : removes the part of the file name that matches "core_suffix_regex" +# PathPrefix : adds "path_prefix_regex" before the path; the position depends on PathIgnore +# PathIgnore : ignores the first "path_ignore_number" folders from the top level, and then adds "path_prefix_regex" to the path +# RemoveTest : decides whether to remove test/ from the path +# FixInclude : Specify the main header.
If it is set, ignore others setting +# Note: This script picks the first fitting "file_regex" rules according the ordering in config +get_include_regex () { + local file="$1" + declare -n local_output=$2 + local core_suffix="" + local path_prefix="" + local path_ignore="0" + local fix_include="" + local remove_test="false" + local item_regex="^-\ +\"(.*)\"" + local path_prefix_regex="PathPrefix:\ +\"(.*)\"" + local core_suffix_regex="CoreSuffix:\ +\"(.*)\"" + local path_ignore_regex="PathIgnore:\ +\"(.*)\"" + local fix_include_regex="FixInclude:\ +\"(.*)\"" + local remove_test_regex="RemoveTest:\ +\"(.*)\"" + local match="false" + while IFS='' read -r line; do + if [[ "$line" =~ $item_regex ]]; then + file_regex="${BASH_REMATCH[1]}" + if [[ "$match" = "true" ]]; then + break + elif [[ $file =~ $file_regex ]]; then + match="true" + fi + elif [ "$match" = "true" ]; then + if [[ "$line" =~ $path_prefix_regex ]]; then + path_prefix="${BASH_REMATCH[1]}" + elif [[ "$line" =~ $core_suffix_regex ]]; then + core_suffix="${BASH_REMATCH[1]}" + elif [[ "$line" =~ $path_ignore_regex ]]; then + path_ignore="${BASH_REMATCH[1]}" + elif [[ "$line" =~ $fix_include_regex ]]; then + fix_include="${BASH_REMATCH[1]}" + elif [[ "$line" =~ $remove_test_regex ]]; then + remove_test="${BASH_REMATCH[1]}" + else + echo "Ignore unknow setting: \"${file_regex}\" - ${line}" + fi + fi + done < "dev_tools/scripts/config" + local_output="" + if [ -z "${fix_include}" ]; then + local path_regex="([a-zA-Z_]*\/){${path_ignore}}(.*)\.(cpp|hpp|cu|cuh)" + if [ ! 
-z "${path_prefix}" ]; then + path_prefix="${path_prefix}/" + fi + local_output=$(echo "${file}" | sed -E "s~\.hip~~g;s~$path_regex~$path_prefix\2~g") + local_output=$(echo "${local_output}" | sed -E "s~$core_suffix$~~g") + local_output="#include (<|\")$local_output\.(hpp|hip\.hpp|cuh)(\"|>)" + if [ "${remove_test}" = "true" ]; then + local_output=$(echo "${local_output}" | sed -E "s~test/~~g") + fi + else + local_output="#include (<|\")$fix_include(\"|>)" + fi +} + +GINKGO_LICENSE_BEACON="************************************************************" + +CONTENT="content.cpp" # Store the residual part (start from namespace) +BEFORE="before.cpp" # Store the main header and the #ifdef/#define of header file +HAS_HIP_RUNTIME="false" +DURING_LICENSE="false" +INCLUDE_REGEX="^#include.*" +INCLUDE_INC="\.inc" +MAIN_PART_MATCH="" + +# FORCE_TOP_ON/OFF is only valid before other #include +FORCE_TOP_ON="// force-top: on" +FORCE_TOP_OFF="// force-top: off" +FORCE_TOP="force_top" +DURING_FORCE_TOP="false" + +get_include_regex $1 MAIN_PART_MATCH +HEADER_DEF=$(get_header_def $1) + +IFNDEF="" +DEFINE="" +IFNDEF_REGEX="^#ifndef GKO_" +DEFINE_REGEX="^#define GKO_" +HEADER_REGEX="\.(hpp|cuh)" +SKIP="true" +START_BLOCK_REX="^(#if| *\/\*)" +END_BLOCK_REX="^#endif|\*\/$" +ENDIF_REX="^#endif" +IN_BLOCK=0 +KEEP_LINES=0 +LAST_NONEMPTY="" +ALARM="" +COMMENT_REGEX="^ *\/\/" +CONSIDER_REGEX="${START_BLOCK_REX}|${END_BLOCK_REX}|${COMMENT_REGEX}|${INCLUDE_REGEX}" + +# This part capture the main header and give the possible fail arrangement information +while IFS='' read -r line || [ -n "$line" ]; do + if [ "${line}" = '#include "hip/hip_runtime.h"' ] && [ "${SKIP}" = "true" ]; then + HAS_HIP_RUNTIME="true" + elif [ "${line}" = "/*${GINKGO_LICENSE_BEACON}" ] || [ "${DURING_LICENSE}" = "true" ]; then + DURING_LICENSE="true" + if [ "${line}" = "${GINKGO_LICENSE_BEACON}*/" ]; then + DURING_LICENSE="false" + fi + elif [ "${SKIP}" = "true" ] && ([ "$line" = "${FORCE_TOP_ON}" ] || [ 
"${DURING_FORCE_TOP}" = "true" ]); then + DURING_FORCE_TOP="true" + if [ "$line" = "${FORCE_TOP_OFF}" ]; then + DURING_FORCE_TOP="false" + fi + if [[ "${line}" =~ $INCLUDE_REGEX ]]; then + line="$(convert_header ${line})" + fi + echo "$line" >> "${FORCE_TOP}" + elif [ -z "${line}" ] && [ "${SKIP}" = "true" ]; then + # Ignore all empty lines between LICENSE and Header + : + else + if [[ "${line}" =~ $INCLUDE_REGEX ]]; then + line="$(convert_header ${line})" + fi + if [ -z "${line}" ]; then + KEEP_LINES=$((KEEP_LINES+1)) + else + LAST_NONEMPTY="${line}" + KEEP_LINES=0 + fi + if [[ $1 =~ ${HEADER_REGEX} ]] && [[ "${line}" =~ ${IFNDEF_REGEX} ]] && [ "${SKIP}" = "true" ] && [ -z "${DEFINE}" ]; then + IFNDEF="${line}" + elif [[ $1 =~ ${HEADER_REGEX} ]] && [[ "${line}" =~ ${DEFINE_REGEX} ]] && [ "${SKIP}" = "true" ] && [ ! -z "${IFNDEF}" ]; then + DEFINE="${line}" + elif [ -z "${MAIN_PART_MATCH}" ] || [[ ! "${line}" =~ ${MAIN_PART_MATCH} ]] || [[ "${IN_BLOCK}" -gt 0 ]]; then + echo "${line}" >> "${CONTENT}" + SKIP="false" + if [[ "${line}" =~ $START_BLOCK_REX ]]; then + # keep everythin in #if block and /* block + IN_BLOCK=$((IN_BLOCK+1)) + if [ -z "${ALARM}" ]; then + ALARM="set" + fi + fi + if [[ "${IN_BLOCK}" = "0" ]] && [ ! -z "${line}" ] && [[ ! "${line}" =~ ${CONSIDER_REGEX} ]]; then + if [ "${ALARM}" = "set" ]; then + ALARM="true" + elif [ -z "${ALARM}" ]; then + ALARM="false" + fi + fi + if [[ "${line}" =~ $END_BLOCK_REX ]]; then + IN_BLOCK=$((IN_BLOCK-1)) + fi + else + echo "${line}" >> ${BEFORE} + fi + fi +done < $1 +if [ "${ALARM}" = "true" ]; then + echo "Warning $1: sorting is probably incorrect" +fi + +# Wrtie license +echo "/*${GINKGO_LICENSE_BEACON}" > $1 +cat LICENSE >> $1 +echo "${GINKGO_LICENSE_BEACON}*/" >> $1 +echo "" >> $1 + +# Wrtie the definition of header according to path +if [ ! -z "${IFNDEF}" ] && [ ! 
-z "${DEFINE}" ]; then + IFNDEF="#ifndef ${HEADER_DEF}" + DEFINE="#define ${HEADER_DEF}" +elif [ -z "${IFNDEF}" ] && [ -z "${DEFINE}" ]; then + : +else + echo "Warning $1: only #ifndef GKO_ or #define GKO_ is in the header" +fi +if [ ! -z "${IFNDEF}" ]; then + echo "${IFNDEF}" >> $1 +fi +if [ ! -z "${DEFINE}" ]; then + echo "${DEFINE}" >> $1 + echo "" >> $1 + echo "" >> $1 +fi + +# Write the force-top header +if [ -f "${FORCE_TOP}" ]; then + cat "${FORCE_TOP}" >> $1 + echo "" >> $1 + echo "" >> $1 + rm "${FORCE_TOP}" +fi + +# Write the main header and give a warning if there are multiple matches +if [ -f "${BEFORE}" ]; then + # sort or remove the duplication + clang-format -i -style=file ${BEFORE} + if [ $(wc -l < ${BEFORE}) -gt "1" ]; then + echo "Warning $1: there are multiple main header matchings" + fi + cat ${BEFORE} >> $1 + if [ -f "${CONTENT}" ]; then + echo "" >> $1 + echo "" >> $1 + fi + rm "${BEFORE}" +fi + +# Arrange the remaining content and append it to the file +if [ -f "${CONTENT}" ]; then + add_regroup + if [ "${HAS_HIP_RUNTIME}" = "true" ]; then + echo "#include " > temp + fi + head -n -${KEEP_LINES} ${CONTENT} >> temp + if [ ! -z "${IFNDEF}" ] && [ ! -z "${DEFINE}" ]; then + # Ignore the last line #endif + if [[ "${LAST_NONEMPTY}" =~ $ENDIF_REX ]]; then + head -n -1 temp > ${CONTENT} + echo "#endif // $HEADER_DEF" >> ${CONTENT} + else + echo "Warning $1: Found the begin header_def but did not find the end of header_def" + cat temp > ${CONTENT} + fi + else + cat temp > "${CONTENT}" + fi + clang-format -i -style=file "${CONTENT}" + rm temp + remove_regroup + PREV_INC=0 + IN_IF="false" + SKIP="true" + while IFS='' read -r line; do + # Skip the empty lines at the beginning + if [ "${SKIP}" = "true" ] && [[ -z "${line}" ]]; then + continue + else + SKIP="false" + fi + # Insert content with the correct number of empty lines + if [[ ${line} =~ ${INCLUDE_REGEX} ]] && [[ !
${line} =~ ${INCLUDE_INC} ]]; then + if [[ ${PREV_INC} == 1 ]]; then + echo "" >> $1 + fi + PREV_INC=0 + else + if [ -z "${line}" ]; then + PREV_INC=$((PREV_INC+1)) + else + # To keep the original lines + PREV_INC=-3 + fi + fi + echo "${line}" >> $1 + done < "${CONTENT}" + rm "${CONTENT}" +fi diff --git a/dev_tools/scripts/gdb-ginkgo.py b/dev_tools/scripts/gdb-ginkgo.py new file mode 100644 index 00000000000..f20f0597b31 --- /dev/null +++ b/dev_tools/scripts/gdb-ginkgo.py @@ -0,0 +1,130 @@ +# Pretty-printers for Ginkgo +# Based on the pretty-printers for libstdc++. + +# Copyright (C) 2008-2020 Free Software Foundation, Inc. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import gdb +import itertools +import sys +import re + +if sys.version_info[0] > 2: + ### Python 3 stuff + Iterator = object + # Python 3 folds these into the normal functions. + imap = map + izip = zip + # Also, int subsumes long + long = int +else: + ### Python 2 stuff + class Iterator: + """Compatibility mixin for iterators + + Instead of writing next() methods for iterators, write + __next__() methods and use this mixin to make them work in + Python 2 as well as Python 3. 
+ + Idea stolen from the "six" documentation: + + """ + + def next(self): + return self.__next__() + + # In Python 2, we still need these from itertools + from itertools import imap, izip + +_versioned_namespace = '__8::' + +def is_specialization_of(x, template_name): + "Test if a type is a given template instantiation." + global _versioned_namespace + if type(x) is gdb.Type: + x = x.tag + if _versioned_namespace: + return re.match('^std::(%s)?%s<.*>$' % (_versioned_namespace, template_name), x) is not None + return re.match('^std::%s<.*>$' % template_name, x) is not None + + +def get_unique_ptr_data_ptr(val): + impl_type = val.type.fields()[0].type.tag + # Check for new implementations first: + if is_specialization_of(impl_type, '__uniq_ptr_data') \ + or is_specialization_of(impl_type, '__uniq_ptr_impl'): + tuple_member = val['_M_t']['_M_t'] + elif is_specialization_of(impl_type, 'tuple'): + tuple_member = val['_M_t'] + else: + raise ValueError("Unsupported implementation for unique_ptr: %s" % impl_type) + tuple_impl_type = tuple_member.type.fields()[0].type # _Tuple_impl + tuple_head_type = tuple_impl_type.fields()[1].type # _Head_base + head_field = tuple_head_type.fields()[0] + if head_field.name == '_M_head_impl': + return tuple_member['_M_head_impl'] + elif head_field.is_base_class: + return tuple_member.cast(head_field.type) + else: + raise ValueError("Unsupported implementation for tuple in unique_ptr: %s" % impl_type) + + +class GkoArrayPrinter: + "Print a gko::Array" + + class _iterator(Iterator): + def __init__ (self, start, size): + self.item = start + self.size = size + self.count = 0 + + def __iter__(self): + return self + + def __next__(self): + count = self.count + self.count = self.count + 1 + if self.count > self.size: + raise StopIteration + elt = self.item.dereference() + self.item = self.item + 1 + return ('[%d]' % count, elt) + + def __init__(self, val): + self.val = val + self.execname = 
str(self.val['exec_']['_M_ptr'].dereference().dynamic_type) + self.pointer = get_unique_ptr_data_ptr(self.val['data_']); + self.is_cpu = re.match('gko::(Reference|Omp)Executor', str(self.execname)) is not None + + def children(self): + if self.is_cpu: + return self._iterator(self.pointer, self.val['num_elems_']) + return [] + + def to_string(self): + return ('%s of length %d on %s (%s)' % (str(self.val.type), int(self.val['num_elems_']), self.execname, self.pointer)) + + def display_hint(self): + return 'array' + +def lookup_type(val): + if not str(val.type).startswith('gko::'): + return None + suffix = str(val.type)[5:] + if suffix.startswith('Array'): + return GkoArrayPrinter(val) + return None + +gdb.pretty_printers.append(lookup_type) diff --git a/dev_tools/scripts/regroup b/dev_tools/scripts/regroup new file mode 100644 index 00000000000..036d5d81588 --- /dev/null +++ b/dev_tools/scripts/regroup @@ -0,0 +1,12 @@ +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^<(rapidjson|gflags|gtest|papi).*' + Priority: 3 + - Regex: '^<(omp|cu|hip|thrust).*' + Priority: 2 + - Regex: '^ "${HEADER_LIST}" +find "${TOP_HEADER_FOLDER}" -name '*.hpp' -type f -print | \ + grep -v 'residual_norm_reduction.hpp' > "${HEADER_LIST}" if [ ${?} -ne 0 ]; then echo "${WARNING_PREFIX} "'The `find` command returned with an error!' 1>&2 @@ -76,7 +77,7 @@ if [[ "$(file "${GINKGO_HEADER_TEMPLATE_FILE}")" == *"CRLF"* ]]; then fi # Generate a new, temporary ginkgo header file. -# It will get compared at the end to the existing file in order to prevent +# It will get compared at the end to the existing file in order to prevent # the rebuilding of targets which depend on the global header # (e.g. 
benchmarks and examples) GINKGO_HEADER_TMP="${GINKGO_HEADER_FILE}.tmp" @@ -109,12 +110,12 @@ while IFS='' read -r line; do while IFS='' read -r prefixed_file; do # Remove the include directory from the file name file="${prefixed_file#${TOP_HEADER_FOLDER}/}" - + # Do not include yourself if [ "${file}" == "${GINKGO_HEADER_FILE}" ]; then continue fi - + CURRENT_FOLDER="$(dirname ${file})" # add newline between different include folder if [ "${READING_FIRST_LINE}" != true ] && \ diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index 2c624dc4fd0..d416149638a 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -6,7 +6,7 @@ option(GINKGO_DOC_GENERATE_PDF "Generate PDF documentation" OFF) option(GINKGO_DOC_GENERATE_DEV "Generate internal documentation" OFF) option(GINKGO_DOC_GENERATE_EXAMPLES "Generate example documentation" ON) if(GINKGO_DOC_GENERATE_EXAMPLES) - add_subdirectory(examples) + add_subdirectory(examples) endif() if (GINKGO_DOC_GENERATE_PDF) diff --git a/doc/DoxygenLayout.xml b/doc/DoxygenLayout.xml index 4c25a288a38..268f8348145 100644 --- a/doc/DoxygenLayout.xml +++ b/doc/DoxygenLayout.xml @@ -5,6 +5,8 @@ + + diff --git a/doc/examples/CMakeLists.txt b/doc/examples/CMakeLists.txt index be0224278da..f227048dad8 100644 --- a/doc/examples/CMakeLists.txt +++ b/doc/examples/CMakeLists.txt @@ -1,41 +1,41 @@ # Collect all of the directory names for the examples programs FILE(GLOB _ginkgo_examples - ${CMAKE_SOURCE_DIR}/examples/* - ) + ${CMAKE_SOURCE_DIR}/examples/* + ) LIST(REMOVE_ITEM _ginkgo_examples "${CMAKE_SOURCE_DIR}/examples/CMakeLists.txt") ADD_CUSTOM_TARGET(examples) file(GLOB _ginkgo_examples_tooltip - ${CMAKE_SOURCE_DIR}/examples/*/doc/tooltip - ) + ${CMAKE_SOURCE_DIR}/examples/*/doc/tooltip + ) file(GLOB _ginkgo_examples_kind - ${CMAKE_SOURCE_DIR}/examples/*/doc/kind - ) + ${CMAKE_SOURCE_DIR}/examples/*/doc/kind + ) file(GLOB _ginkgo_examples_buildson - ${CMAKE_SOURCE_DIR}/examples/*/doc/builds-on - ) + 
${CMAKE_SOURCE_DIR}/examples/*/doc/builds-on + ) ADD_CUSTOM_COMMAND( - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/examples.hpp - COMMAND ${PERL_EXECUTABLE} - ARGS - ${CMAKE_SOURCE_DIR}/doc/scripts/examples.pl - ${CMAKE_CURRENT_SOURCE_DIR}/examples.hpp.in - ${_ginkgo_examples} - > ${CMAKE_CURRENT_BINARY_DIR}/examples.hpp - DEPENDS - ${CMAKE_SOURCE_DIR}/doc/scripts/examples.pl - ${CMAKE_CURRENT_SOURCE_DIR}/examples.hpp.in - ${_ginkgo_examples_tooltip} - ${_ginkgo_examples_kind} - ${_ginkgo_examples_buildson} - ) + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/examples.hpp + COMMAND ${PERL_EXECUTABLE} + ARGS + ${CMAKE_SOURCE_DIR}/doc/scripts/examples.pl + ${CMAKE_CURRENT_SOURCE_DIR}/examples.hpp.in + ${_ginkgo_examples} + > ${CMAKE_CURRENT_BINARY_DIR}/examples.hpp + DEPENDS + ${CMAKE_SOURCE_DIR}/doc/scripts/examples.pl + ${CMAKE_CURRENT_SOURCE_DIR}/examples.hpp.in + ${_ginkgo_examples_tooltip} + ${_ginkgo_examples_kind} + ${_ginkgo_examples_buildson} + ) ADD_CUSTOM_TARGET(build_examples_hpp - DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/examples.hpp - COMMENT - "Building examples.hpp") + DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/examples.hpp + COMMENT + "Building examples.hpp") ADD_DEPENDENCIES(examples build_examples_hpp) # @@ -43,46 +43,46 @@ ADD_DEPENDENCIES(examples build_examples_hpp) # FOREACH(example ${_ginkgo_examples}) - GET_FILENAME_COMPONENT(example "${example}" NAME) + GET_FILENAME_COMPONENT(example "${example}" NAME) - ADD_CUSTOM_COMMAND( - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${example}.cpp - COMMAND ${PERL_EXECUTABLE} - ARGS - ${CMAKE_SOURCE_DIR}/doc/scripts/program2plain - < ${CMAKE_SOURCE_DIR}/examples/${example}/${example}.cpp - > ${CMAKE_CURRENT_BINARY_DIR}/${example}.cpp - DEPENDS - ${CMAKE_SOURCE_DIR}/doc/scripts/program2plain - ${CMAKE_SOURCE_DIR}/examples/${example}/${example}.cpp - VERBATIM - ) + ADD_CUSTOM_COMMAND( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${example}.cpp + COMMAND ${PERL_EXECUTABLE} + ARGS + ${CMAKE_SOURCE_DIR}/doc/scripts/program2plain + < 
${CMAKE_SOURCE_DIR}/examples/${example}/${example}.cpp + > ${CMAKE_CURRENT_BINARY_DIR}/${example}.cpp + DEPENDS + ${CMAKE_SOURCE_DIR}/doc/scripts/program2plain + ${CMAKE_SOURCE_DIR}/examples/${example}/${example}.cpp + VERBATIM + ) - ADD_CUSTOM_COMMAND( - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${example}.hpp - COMMAND ${PERL_EXECUTABLE} - ARGS - ${CMAKE_SOURCE_DIR}/doc/scripts/make_example.pl - ${example} ${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR} - > ${CMAKE_CURRENT_BINARY_DIR}/${example}.hpp - WORKING_DIRECTORY - ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS - ${CMAKE_SOURCE_DIR}/doc/scripts/make_example.pl - ${CMAKE_SOURCE_DIR}/doc/scripts/intro2toc - ${CMAKE_SOURCE_DIR}/doc/scripts/create_anchors - ${CMAKE_SOURCE_DIR}/doc/scripts/program2doxygen - ${CMAKE_SOURCE_DIR}/examples/${example}/${example}.cpp - ${CMAKE_SOURCE_DIR}/examples/${example}/doc/intro.dox - ${CMAKE_SOURCE_DIR}/examples/${example}/doc/results.dox - ) + ADD_CUSTOM_COMMAND( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${example}.hpp + COMMAND ${PERL_EXECUTABLE} + ARGS + ${CMAKE_SOURCE_DIR}/doc/scripts/make_example.pl + ${example} ${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR} + > ${CMAKE_CURRENT_BINARY_DIR}/${example}.hpp + WORKING_DIRECTORY + ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS + ${CMAKE_SOURCE_DIR}/doc/scripts/make_example.pl + ${CMAKE_SOURCE_DIR}/doc/scripts/intro2toc + ${CMAKE_SOURCE_DIR}/doc/scripts/create_anchors + ${CMAKE_SOURCE_DIR}/doc/scripts/program2doxygen + ${CMAKE_SOURCE_DIR}/examples/${example}/${example}.cpp + ${CMAKE_SOURCE_DIR}/examples/${example}/doc/intro.dox + ${CMAKE_SOURCE_DIR}/examples/${example}/doc/results.dox + ) - ADD_CUSTOM_TARGET(examples_${example} - DEPENDS - ${CMAKE_CURRENT_BINARY_DIR}/${example}.hpp - ${CMAKE_CURRENT_BINARY_DIR}/${example}.cpp - COMMENT + ADD_CUSTOM_TARGET(examples_${example} + DEPENDS + ${CMAKE_CURRENT_BINARY_DIR}/${example}.hpp + ${CMAKE_CURRENT_BINARY_DIR}/${example}.cpp + COMMENT "Building doxygen input file for examples program <${example}>" - ) - 
ADD_DEPENDENCIES(examples examples_${example}) + ) + ADD_DEPENDENCIES(examples examples_${example}) ENDFOREACH() diff --git a/doc/examples/examples.hpp.in b/doc/examples/examples.hpp.in index 6dcff03ab3c..cf0b01f9a04 100644 --- a/doc/examples/examples.hpp.in +++ b/doc/examples/examples.hpp.in @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -128,8 +128,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * @ref twentyseven_pt_stencil_solver - * Using a twentyseven point 3D stencil to solve the poisson equation with - * array views. + * Using a twentyseven point 3D stencil to solve the poisson equation + * with array views. * * * diff --git a/doc/headers/cuda_executor.hpp b/doc/headers/cuda_executor.hpp index d42eb6bc197..52b9307357a 100644 --- a/doc/headers/cuda_executor.hpp +++ b/doc/headers/cuda_executor.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/doc/headers/executors.hpp b/doc/headers/executors.hpp index d12e9c804da..002f64230e9 100644 --- a/doc/headers/executors.hpp +++ b/doc/headers/executors.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -51,6 +51,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* CPU); * + @ref exec_cuda specifies that the data should be stored and the * operations executed on the NVIDIA GPU accelerator; + * + @ref exec_hip uses the HIP library to compile code for either NVIDIA or + * AMD GPU accelerator; * + @ref exec_ref executes a non-optimized reference implementation, * which can be used to debug the library. */ diff --git a/doc/headers/factor.hpp b/doc/headers/factor.hpp index 324f9657235..320668cabae 100644 --- a/doc/headers/factor.hpp +++ b/doc/headers/factor.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/doc/headers/hip_executor.hpp b/doc/headers/hip_executor.hpp new file mode 100644 index 00000000000..4805cfb4b87 --- /dev/null +++ b/doc/headers/hip_executor.hpp @@ -0,0 +1,40 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +/** + * @defgroup exec_hip HIP Executor + * + * @brief A module dedicated to the implementation and usage of the HIP + * executor in Ginkgo. + * + * @ingroup Executor + */ diff --git a/doc/headers/jacobi.hpp b/doc/headers/jacobi.hpp index 8e406d75fea..875efa2c4d0 100644 --- a/doc/headers/jacobi.hpp +++ b/doc/headers/jacobi.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/doc/headers/linop.hpp b/doc/headers/linop.hpp index e208fd9391c..12fc582eb9d 100644 --- a/doc/headers/linop.hpp +++ b/doc/headers/linop.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/doc/headers/logging.hpp b/doc/headers/logging.hpp index e30edb61e72..e9563469b87 100644 --- a/doc/headers/logging.hpp +++ b/doc/headers/logging.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/doc/headers/matrix_formats.hpp b/doc/headers/matrix_formats.hpp index b085eba5bc8..641cb98bc13 100644 --- a/doc/headers/matrix_formats.hpp +++ b/doc/headers/matrix_formats.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/doc/headers/modules.dot b/doc/headers/modules.dot deleted file mode 100644 index 6978fdf670b..00000000000 --- a/doc/headers/modules.dot +++ /dev/null @@ -1,37 +0,0 @@ - digraph G -{ - graph[rankdir="TB",bgcolor="transparent"]; - - edge [fontname="Times-Roman",fontsize=15,labelfontname="Times-Roman",labelfontsize=14]; - node [fontname="Times-Roman",fontsize=15, - shape=record,height=0.2,width=0.4, - color="black", fillcolor="white", style="filled"]; - - exec [label="Executors",URL="\ref Executor"]; - omp [label="OpenMP Executor",URL="\ref exec_omp "]; - ref [label="Reference Executor",URL="\ref exec_ref"]; - cuda [label="CUDA Executor",URL="\ref exec_cuda"]; - linop [label="Linear Operators",URL="\ref LinOp"]; - solvers [label="Solvers",URL="\ref solvers"]; - precond [label="Preconditioners",URL="\ref precond"]; - factor [label="Factorizations",URL="\ref factor"]; - matformat [label="Matrix Formats",URL="\ref mat_formats "]; - // log [label="Logging",URL="\ref log"]; - stop [label="Stopping Criteria",URL="\ref stop"]; - - // log -> exec 
[color="black",fontsize=14,style="solid",fontname="Times-Roman"]; - exec -> ref [color="black",fontsize=14,style="solid",fontname="Times-Roman"]; - exec -> cuda[color="black",fontsize=14,style="solid",fontname="Times-Roman"]; - exec -> omp [color="black",fontsize=14,style="solid",fontname="Times-Roman"]; - omp -> linop [color="black",fontsize=14,style="solid",fontname="Times-Roman"]; - cuda -> linop [color="black",fontsize=14,style="solid",fontname="Times-Roman"]; - ref -> linop [color="black",fontsize=14,style="solid",fontname="Times-Roman"]; - omp -> stop [color="black",fontsize=14,style="solid",fontname="Times-Roman"]; - cuda -> stop [color="black",fontsize=14,style="solid",fontname="Times-Roman"]; - ref -> stop [color="black",fontsize=14,style="solid",fontname="Times-Roman"]; - linop -> matformat [color="black",fontsize=14,style="solid",fontname="Times-Roman"]; - linop -> solvers [color="black",fontsize=14,style="solid",fontname="Times-Roman"]; - linop -> precond [color="black",fontsize=14,style="solid",fontname="Times-Roman"]; - linop -> factor [color="black",fontsize=14,style="solid",fontname="Times-Roman"]; - stop -> solvers [color="black",fontsize=14,style="dashed",fontname="Times-Roman"]; -} diff --git a/doc/headers/omp_executor.hpp b/doc/headers/omp_executor.hpp index 0f6c5e7e140..83df1f5b292 100644 --- a/doc/headers/omp_executor.hpp +++ b/doc/headers/omp_executor.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/doc/headers/preconditioners.hpp b/doc/headers/preconditioners.hpp index 1da0d147f4b..b7797b92d1b 100644 --- a/doc/headers/preconditioners.hpp +++ b/doc/headers/preconditioners.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/doc/headers/ref_executor.hpp b/doc/headers/ref_executor.hpp index dd7b4ea0940..c4faf61e2c4 100644 --- a/doc/headers/ref_executor.hpp +++ b/doc/headers/ref_executor.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/doc/headers/solvers.hpp b/doc/headers/solvers.hpp index 481120c36a7..ac0f797cf02 100644 --- a/doc/headers/solvers.hpp +++ b/doc/headers/solvers.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/doc/headers/stop.hpp b/doc/headers/stop.hpp index 2439d7d9162..16ce4487601 100644 --- a/doc/headers/stop.hpp +++ b/doc/headers/stop.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/doc/helpers.cmake b/doc/helpers.cmake index cbda187a9a7..3cb4c636bc1 100644 --- a/doc/helpers.cmake +++ b/doc/helpers.cmake @@ -7,11 +7,11 @@ function(ginkgo_configure_to_string in variable) endfunction() macro(ginkgo_to_string variable) - set(${variable} "") - foreach(var ${ARGN}) - set(${variable} "${${variable}} ${var}") - endforeach() - string(STRIP "${${variable}}" ${variable}) + set(${variable} "") + foreach(var ${ARGN}) + set(${variable} "${${variable}} ${var}") + endforeach() + string(STRIP "${${variable}}" ${variable}) endmacro() # writes the concatenated configured files @@ -47,65 +47,66 @@ function(ginkgo_doc_gen name in pdf mainpage-in) set(doxyfile "${CMAKE_CURRENT_BINARY_DIR}/Doxyfile-${name}") set(layout "${DOC_BASE}/DoxygenLayout.xml") ginkgo_file_concat("${DOC_BASE}/pages" - "${mainpage-in}" BASE_DOC.md "${MAINPAGE}" - ) + "${mainpage-in}" BASE_DOC.md "${MAINPAGE}" + ) set(doxygen_base_input - "${DOC_BASE}/headers/" - ) + "${DOC_BASE}/headers/" + ) list(APPEND doxygen_base_input - ${CMAKE_BINARY_DIR}/include/ginkgo/config.hpp - ${DIR_BASE}/include - ${MAINPAGE} - ) - if(GINKGO_DOC_GENERATE_EXAMPLES) - list(APPEND doxygen_base_input - ${CMAKE_CURRENT_BINARY_DIR}/examples/examples.hpp + ${CMAKE_BINARY_DIR}/include/ginkgo/config.hpp + ${DIR_BASE}/include + ${MAINPAGE} ) + if(GINKGO_DOC_GENERATE_EXAMPLES) + list(APPEND doxygen_base_input + ${CMAKE_CURRENT_BINARY_DIR}/examples/examples.hpp + ) endif() set(doxygen_dev_input - "${DIR_BASE}/core" - ) + "${DIR_BASE}/core" + ) list(APPEND doxygen_dev_input - ${DIR_BASE}/omp - ${DIR_BASE}/cuda - ${DIR_BASE}/reference - ) + ${DIR_BASE}/omp + ${DIR_BASE}/cuda + ${DIR_BASE}/hip + ${DIR_BASE}/reference + ) set(doxygen_image_path "${CMAKE_SOURCE_DIR}/doc/images/") file(GLOB doxygen_depend - ${DOC_BASE}/headers/*.hpp - ${DIR_BASE}/include/ginkgo/**/*.hpp - ) - list(APPEND doxygen_depend - ${CMAKE_BINARY_DIR}/include/ginkgo/config.hpp - 
) - if(GINKGO_DOC_GENERATE_EXAMPLES) - list(APPEND doxygen_depend - ${CMAKE_CURRENT_BINARY_DIR}/examples/examples.hpp + ${DOC_BASE}/headers/*.hpp + ${DIR_BASE}/include/ginkgo/**/*.hpp ) - FILE(GLOB _ginkgo_examples - ${DIR_BASE}/examples/* + list(APPEND doxygen_depend + ${CMAKE_BINARY_DIR}/include/ginkgo/config.hpp ) - LIST(REMOVE_ITEM _ginkgo_examples "${DIR_BASE}/examples/CMakeLists.txt") - FOREACH(_ex ${_ginkgo_examples}) - GET_FILENAME_COMPONENT(_ex "${_ex}" NAME) - LIST(APPEND doxygen_depend - ${CMAKE_CURRENT_BINARY_DIR}/examples/${_ex}.hpp - ) - LIST(APPEND doxygen_base_input - ${CMAKE_CURRENT_BINARY_DIR}/examples/${_ex}.hpp - ) - ENDFOREACH() + if(GINKGO_DOC_GENERATE_EXAMPLES) + list(APPEND doxygen_depend + ${CMAKE_CURRENT_BINARY_DIR}/examples/examples.hpp + ) + FILE(GLOB _ginkgo_examples + ${DIR_BASE}/examples/* + ) + LIST(REMOVE_ITEM _ginkgo_examples "${DIR_BASE}/examples/CMakeLists.txt") + FOREACH(_ex ${_ginkgo_examples}) + GET_FILENAME_COMPONENT(_ex "${_ex}" NAME) + LIST(APPEND doxygen_depend + ${CMAKE_CURRENT_BINARY_DIR}/examples/${_ex}.hpp + ) + LIST(APPEND doxygen_base_input + ${CMAKE_CURRENT_BINARY_DIR}/examples/${_ex}.hpp + ) + ENDFOREACH() endif() list(APPEND doxygen_dev_input - ${doxygen_base_input} - ) + ${doxygen_base_input} + ) # pick some markdown files we want as pages - set(doxygen_markdown_files "../../INSTALL.md ../../TESTING.md ../../BENCHMARKING.md") + set(doxygen_markdown_files "../../INSTALL.md ../../TESTING.md ../../BENCHMARKING.md ../../CONTRIBUTING.md ../../CITING.md") ginkgo_to_string(doxygen_base_input_str ${doxygen_base_input} ) ginkgo_to_string(doxygen_dev_input_str ${doxygen_dev_input} ) ginkgo_to_string(doxygen_image_path_str ${doxygen_image_path} ) add_custom_target("${name}" ALL - #DEPEND "${doxyfile}.stamp" Doxyfile.in ${in} ${in2} + #DEPEND "${doxyfile}.stamp" Doxyfile.in ${in} ${in2} COMMAND "${DOXYGEN_EXECUTABLE}" ${doxyfile} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} DEPENDS diff --git a/examples/CMakeLists.txt 
b/examples/CMakeLists.txt index 77d1117667a..1ea92e19886 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -3,13 +3,17 @@ add_subdirectory(custom-logger) add_subdirectory(custom-matrix-format) add_subdirectory(custom-stopping-criterion) if(GINKGO_BUILD_EXTLIB_EXAMPLE) - add_subdirectory(external-lib-interfacing) + add_subdirectory(external-lib-interfacing) endif() +add_subdirectory(adaptiveprecision-blockjacobi) add_subdirectory(ginkgo-overhead) add_subdirectory(ginkgo-ranges) add_subdirectory(ilu-preconditioned-solver) +add_subdirectory(ir-ilu-preconditioned-solver) add_subdirectory(inverse-iteration) +add_subdirectory(iterative-refinement) add_subdirectory(minimal-cuda-solver) +add_subdirectory(mixed-precision-ir) add_subdirectory(nine-pt-stencil-solver) add_subdirectory(papi-logging) add_subdirectory(performance-debugging) @@ -19,4 +23,3 @@ add_subdirectory(simple-solver) add_subdirectory(simple-solver-logging) add_subdirectory(three-pt-stencil-solver) add_subdirectory(twentyseven-pt-stencil-solver) - diff --git a/examples/adaptiveprecision-blockjacobi/CMakeLists.txt b/examples/adaptiveprecision-blockjacobi/CMakeLists.txt new file mode 100644 index 00000000000..d3188aaca12 --- /dev/null +++ b/examples/adaptiveprecision-blockjacobi/CMakeLists.txt @@ -0,0 +1,4 @@ +add_executable(adaptiveprecision-blockjacobi adaptiveprecision-blockjacobi.cpp) +target_link_libraries(adaptiveprecision-blockjacobi ginkgo) +target_include_directories(adaptiveprecision-blockjacobi PRIVATE ${PROJECT_SOURCE_DIR}) +configure_file(data/A.mtx data/A.mtx COPYONLY) diff --git a/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp b/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp new file mode 100644 index 00000000000..3b58d78e37a --- /dev/null +++ b/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp @@ -0,0 +1,146 @@ +/************************************************************* +Copyright (c) 2017-2020, 
the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + + +#include + + +#include +#include +#include +#include + + +int main(int argc, char *argv[]) +{ + // Some shortcuts + using ValueType = double; + using IndexType = int; + using vec = gko::matrix::Dense; + using mtx = gko::matrix::Csr; + using cg = gko::solver::Cg; + using bj = gko::preconditioner::Jacobi; + + // Print version information + std::cout << gko::version_info::get() << std::endl; + + // Figure out where to run the code + std::shared_ptr exec; + if (argc == 1 || std::string(argv[1]) == "reference") { + exec = gko::ReferenceExecutor::create(); + } else if (argc == 2 && std::string(argv[1]) == "omp") { + exec = gko::OmpExecutor::create(); + } else if (argc == 2 && std::string(argv[1]) == "cuda" && + gko::CudaExecutor::get_num_devices() > 0) { + exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true); + } else if (argc == 2 && std::string(argv[1]) == "hip" && + gko::HipExecutor::get_num_devices() > 0) { + exec = gko::HipExecutor::create(0, gko::OmpExecutor::create(), true); + } else { + std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl; + std::exit(-1); + } + + // Read data + auto A = share(gko::read(std::ifstream("data/A.mtx"), exec)); + // Create RHS and initial guess as 1 + gko::size_type size = A->get_size()[0]; + auto host_x = vec::create(exec->get_master(), gko::dim<2>(size, 1)); + for (auto i = 0; i < size; i++) { + host_x->at(i, 0) = 1.; + } + auto x = vec::create(exec); + auto b = vec::create(exec); + x->copy_from(host_x.get()); + b->copy_from(host_x.get()); + + // Calculate initial residual by overwriting b + auto one = gko::initialize({1.0}, exec); + auto neg_one = gko::initialize({-1.0}, exec); + auto initres = gko::initialize({0.0}, exec); + A->apply(lend(one), lend(x), lend(neg_one), lend(b)); + b->compute_norm2(lend(initres)); + + // copy b again + b->copy_from(host_x.get()); + const gko::remove_complex reduction_factor = 1e-7; + auto iter_stop = + 
gko::stop::Iteration::build().with_max_iters(10000u).on(exec); + auto tol_stop = gko::stop::ResidualNormReduction::build() + .with_reduction_factor(reduction_factor) + .on(exec); + + std::shared_ptr> logger = + gko::log::Convergence::create(exec); + iter_stop->add_logger(logger); + tol_stop->add_logger(logger); + + // Create solver factory + auto solver_gen = + cg::build() + .with_criteria(gko::share(iter_stop), gko::share(tol_stop)) + // Add preconditioner, these 2 lines are the only + // difference from the simple solver example + .with_preconditioner(bj::build() + .with_max_block_size(16u) + .with_storage_optimization( + gko::precision_reduction::autodetect()) + .on(exec)) + .on(exec); + // Create solver + auto solver = solver_gen->generate(A); + + + // Solve system + exec->synchronize(); + std::chrono::nanoseconds time(0); + auto tic = std::chrono::steady_clock::now(); + solver->apply(lend(b), lend(x)); + auto toc = std::chrono::steady_clock::now(); + time += std::chrono::duration_cast(toc - tic); + + // Calculate residual + auto res = gko::initialize({0.0}, exec); + A->apply(lend(one), lend(x), lend(neg_one), lend(b)); + b->compute_norm2(lend(res)); + + std::cout << "Initial residual norm sqrt(r^T r): \n"; + write(std::cout, lend(initres)); + std::cout << "Final residual norm sqrt(r^T r): \n"; + write(std::cout, lend(res)); + + // Print solver statistics + std::cout << "CG iteration count: " << logger->get_num_iterations() + << std::endl; + std::cout << "CG execution time [ms]: " + << static_cast(time.count()) / 1000000.0 << std::endl; +} diff --git a/examples/adaptiveprecision-blockjacobi/build.sh b/examples/adaptiveprecision-blockjacobi/build.sh new file mode 100755 index 00000000000..fba046ccb94 --- /dev/null +++ b/examples/adaptiveprecision-blockjacobi/build.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# set up script +if [ $# -ne 1 ]; then + echo -e "Usage: $0 GINKGO_BUILD_DIRECTORY" + exit 1 +fi +BUILD_DIR=$1 +THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" 
&>/dev/null && pwd ) + +# copy libraries +LIBRARY_DIRS="core core/device_hooks reference omp cuda hip" +LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip" +SUFFIXES=".so .dylib .dll d.so d.dylib d.dll" +for prefix in ${LIBRARY_DIRS}; do + for name in ${LIBRARY_NAMES}; do + for suffix in ${SUFFIXES}; do + cp ${BUILD_DIR}/${prefix}/lib${name}${suffix} \ + ${THIS_DIR}/lib${name}${suffix} 2>/dev/null + done + done +done + +# figure out correct compiler flags +if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then + LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip" +else + LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd" +fi +if [ -z "${CXX}" ]; then + CXX="c++" +fi + +# build +${CXX} -std=c++11 -o ${THIS_DIR}/adaptiveprecision-blockjacobi \ + ${THIS_DIR}/adaptiveprecision-blockjacobi.cpp \ + -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \ + -L${THIS_DIR} ${LINK_FLAGS} diff --git a/examples/adaptiveprecision-blockjacobi/data/A.mtx b/examples/adaptiveprecision-blockjacobi/data/A.mtx new file mode 100644 index 00000000000..c67437da567 --- /dev/null +++ b/examples/adaptiveprecision-blockjacobi/data/A.mtx @@ -0,0 +1,114 @@ +%%MatrixMarket matrix coordinate integer symmetric +%------------------------------------------------------------------------------- +% UF Sparse Matrix Collection, Tim Davis +% http://www.cise.ufl.edu/research/sparse/matrices/JGD_Trefethen/Trefethen_20b +% name: JGD_Trefethen/Trefethen_20b +% [Diagonal matrices with primes, Nick Trefethen, Oxford Univ.] +% id: 2203 +% date: 2008 +% author: N. Trefethen +% ed: J.-G. Dumas +% fields: name title A id date author ed kind notes +% kind: combinatorial problem +%------------------------------------------------------------------------------- +% notes: +% Diagonal matrices with primes, Nick Trefethen, Oxford Univ. 
+% From Jean-Guillaume Dumas' Sparse Integer Matrix Collection, +% http://ljk.imag.fr/membres/Jean-Guillaume.Dumas/simc.html +% +% Problem 7 of the Hundred-dollar, Hundred-digit Challenge Problems, +% SIAM News, vol 35, no. 1. +% +% 7. Let A be the 20,000 x 20,000 matrix whose entries are zero +% everywhere except for the primes 2, 3, 5, 7, . . . , 224737 along the +% main diagonal and the number 1 in all the positions A(i,j) with +% |i-j| = 1,2,4,8, . . . ,16384. What is the (1,1) entry of inv(A)? +% +% http://www.siam.org/news/news.php?id=388 +% +% Filename in JGD collection: Trefethen/trefethen_20__19_minor.sms +%------------------------------------------------------------------------------- +19 19 83 +1 1 3 +2 1 1 +3 1 1 +5 1 1 +9 1 1 +17 1 1 +2 2 5 +3 2 1 +4 2 1 +6 2 1 +10 2 1 +18 2 1 +3 3 7 +4 3 1 +5 3 1 +7 3 1 +11 3 1 +19 3 1 +4 4 11 +5 4 1 +6 4 1 +8 4 1 +12 4 1 +5 5 13 +6 5 1 +7 5 1 +9 5 1 +13 5 1 +6 6 17 +7 6 1 +8 6 1 +10 6 1 +14 6 1 +7 7 19 +8 7 1 +9 7 1 +11 7 1 +15 7 1 +8 8 23 +9 8 1 +10 8 1 +12 8 1 +16 8 1 +9 9 29 +10 9 1 +11 9 1 +13 9 1 +17 9 1 +10 10 31 +11 10 1 +12 10 1 +14 10 1 +18 10 1 +11 11 37 +12 11 1 +13 11 1 +15 11 1 +19 11 1 +12 12 41 +13 12 1 +14 12 1 +16 12 1 +13 13 43 +14 13 1 +15 13 1 +17 13 1 +14 14 47 +15 14 1 +16 14 1 +18 14 1 +15 15 53 +16 15 1 +17 15 1 +19 15 1 +16 16 59 +17 16 1 +18 16 1 +17 17 61 +18 17 1 +19 17 1 +18 18 67 +19 18 1 +19 19 71 diff --git a/examples/adaptiveprecision-blockjacobi/doc/builds-on b/examples/adaptiveprecision-blockjacobi/doc/builds-on new file mode 100644 index 00000000000..9b64c9bfd28 --- /dev/null +++ b/examples/adaptiveprecision-blockjacobi/doc/builds-on @@ -0,0 +1 @@ +preconditioned-solver diff --git a/examples/adaptiveprecision-blockjacobi/doc/intro.dox b/examples/adaptiveprecision-blockjacobi/doc/intro.dox new file mode 100644 index 00000000000..410f698f261 --- /dev/null +++ b/examples/adaptiveprecision-blockjacobi/doc/intro.dox @@ -0,0 +1,10 @@ + +

This example shows how to use the adaptive precision block-Jacobi +preconditioner.

+ +

In this example, we first read in a matrix from file, then generate a +right-hand side and an initial guess. The preconditioned CG solver is enhanced +with a block-Jacobi preconditioner that optimizes the storage format for the +distinct inverted diagonal blocks to the numerical requirements. The example +features the iteration count and runtime of the CG solver.

+ diff --git a/examples/adaptiveprecision-blockjacobi/doc/kind b/examples/adaptiveprecision-blockjacobi/doc/kind new file mode 100644 index 00000000000..53a96d5771f --- /dev/null +++ b/examples/adaptiveprecision-blockjacobi/doc/kind @@ -0,0 +1 @@ +preconditioners diff --git a/examples/adaptiveprecision-blockjacobi/doc/results.dox b/examples/adaptiveprecision-blockjacobi/doc/results.dox new file mode 100644 index 00000000000..87c5b74c60c --- /dev/null +++ b/examples/adaptiveprecision-blockjacobi/doc/results.dox @@ -0,0 +1,19 @@ +

Results

+This is the expected output: + +@code{.cpp} + +Initial residual norm sqrt(r^T r): +%%MatrixMarket matrix array real general +1 1 +194.679 +Final residual norm sqrt(r^T r): +%%MatrixMarket matrix array real general +1 1 +2.8994e-11 +CG iteration count: 8 +CG execution time [ms]: 4.10581 + +@endcode + +

Comments about programming and debugging

diff --git a/examples/adaptiveprecision-blockjacobi/doc/short-intro b/examples/adaptiveprecision-blockjacobi/doc/short-intro new file mode 100644 index 00000000000..7aa3396bae6 --- /dev/null +++ b/examples/adaptiveprecision-blockjacobi/doc/short-intro @@ -0,0 +1 @@ +The preconditioned solver example. diff --git a/examples/adaptiveprecision-blockjacobi/doc/tooltip b/examples/adaptiveprecision-blockjacobi/doc/tooltip new file mode 100644 index 00000000000..6458f7fb3e5 --- /dev/null +++ b/examples/adaptiveprecision-blockjacobi/doc/tooltip @@ -0,0 +1 @@ +Use a preconditioner in Ginkgo. Solve a linear system. diff --git a/examples/custom-logger/build.sh b/examples/custom-logger/build.sh index 5cd278063a8..67587b6aa90 100755 --- a/examples/custom-logger/build.sh +++ b/examples/custom-logger/build.sh @@ -9,8 +9,8 @@ BUILD_DIR=$1 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd ) # copy libraries -LIBRARY_DIRS="core core/device_hooks reference omp cuda" -LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda" +LIBRARY_DIRS="core core/device_hooks reference omp cuda hip" +LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip" SUFFIXES=".so .dylib .dll d.so d.dylib d.dll" for prefix in ${LIBRARY_DIRS}; do for name in ${LIBRARY_NAMES}; do @@ -23,9 +23,9 @@ done # figure out correct compiler flags if ls ${THIS_DIR} | grep -F "libginkgo." 
>/dev/null; then - LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference" + LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip" else - LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced" + LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd" fi if [ -z "${CXX}" ]; then CXX="c++" diff --git a/examples/custom-logger/custom-logger.cpp b/examples/custom-logger/custom-logger.cpp index d7ecde9b576..d5ded538df7 100644 --- a/examples/custom-logger/custom-logger.cpp +++ b/examples/custom-logger/custom-logger.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -51,18 +51,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // Utility function which gets the scalar value of a Ginkgo gko::matrix::Dense // matrix representing the norm of a vector. template -double get_norm(const gko::matrix::Dense *norm) +gko::remove_complex get_norm( + const gko::matrix::Dense *norm) { // Put the value on CPU thanks to the master executor auto cpu_norm = clone(norm->get_executor()->get_master(), norm); // Return the scalar value contained at position (0, 0) - return cpu_norm->at(0, 0); + return std::real(cpu_norm->at(0, 0)); } // Utility function which computes the norm of a Ginkgo gko::matrix::Dense // vector. 
template -double compute_norm(const gko::matrix::Dense *b) +gko::remove_complex compute_norm( + const gko::matrix::Dense *b) { // Get the executor of the vector auto exec = b->get_executor(); @@ -83,10 +85,10 @@ struct ResidualLogger : gko::log::Logger { void write() const { // Print a header for the table - std::cout << "Recurrent vs real residual norm:" << std::endl; + std::cout << "Recurrent vs true residual norm:" << std::endl; std::cout << '|' << std::setw(10) << "Iteration" << '|' << std::setw(25) << "Recurrent Residual Norm" << '|' << std::setw(25) - << "Real Residual Norm" << '|' << std::endl; + << "True Residual Norm" << '|' << std::endl; // Print a separation line. Note that for creating `10` characters // `std::setw()` should be set to `11`. std::cout << '|' << std::setfill('-') << std::setw(11) << '|' @@ -188,14 +190,16 @@ int main(int argc, char *argv[]) // with one column/one row. The advantage of this concept is that using // multiple vectors is a now a natural extension of adding columns/rows are // necessary. - using vec = gko::matrix::Dense<>; + using ValueType = double; + using IndexType = int; + using vec = gko::matrix::Dense; // The gko::matrix::Csr class is used here, but any other matrix class such // as gko::matrix::Coo, gko::matrix::Hybrid, gko::matrix::Ell or // gko::matrix::Sellp could also be used. - using mtx = gko::matrix::Csr<>; + using mtx = gko::matrix::Csr; // The gko::solver::Cg is used here, but any other solver class can also be // used. - using cg = gko::solver::Cg<>; + using cg = gko::solver::Cg; // Print the ginkgo version information. 
std::cout << gko::version_info::get() << std::endl; @@ -217,7 +221,10 @@ int main(int argc, char *argv[]) exec = gko::OmpExecutor::create(); } else if (argc == 2 && std::string(argv[1]) == "cuda" && gko::CudaExecutor::get_num_devices() > 0) { - exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); + exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true); + } else if (argc == 2 && std::string(argv[1]) == "hip" && + gko::HipExecutor::get_num_devices() > 0) { + exec = gko::HipExecutor::create(0, gko::OmpExecutor::create()); } else { std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl; std::exit(-1); @@ -234,6 +241,7 @@ int main(int argc, char *argv[]) auto A = share(gko::read(std::ifstream("data/A.mtx"), exec)); auto b = gko::read(std::ifstream("data/b.mtx"), exec); auto x = gko::read(std::ifstream("data/x0.mtx"), exec); + const gko::remove_complex reduction_factor = 1e-7; // @sect3{Creating the solver} // Generate the gko::solver factory. Ginkgo uses the concept of Factories to @@ -248,14 +256,14 @@ int main(int argc, char *argv[]) cg::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(20u).on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-15) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(reduction_factor) .on(exec)) .on(exec); // Instantiate a ResidualLogger logger. - auto logger = std::make_shared>(exec, gko::lend(A), - gko::lend(b)); + auto logger = std::make_shared>( + exec, gko::lend(A), gko::lend(b)); // Add the previously created logger to the solver factory. The logger will // be automatically propagated to all solvers created from this factory. 
diff --git a/examples/custom-logger/doc/intro.dox b/examples/custom-logger/doc/intro.dox index a81e16f2111..5d135a65013 100644 --- a/examples/custom-logger/doc/intro.dox +++ b/examples/custom-logger/doc/intro.dox @@ -9,13 +9,13 @@ In this example, a simple logger is implemented to track the solver's recurrent

About the example

Each example has the following sections:
    -
  1. Introduction:This gives an overview of the example and mentions - any interesting aspects in the example that might help the reader. -
  2. The commented program: This section is intended for you to - understand the details of the example so that you can play with it and understand - Ginkgo and its features better. -
  3. Results: This section shows the results of the code when run. Though the - results may not be completely the same, you can expect the behaviour to be similar. -
  4. The plain program: This is the complete code without any comments to have - an complete overview of the code. -
+
  • Introduction: This gives an overview of the example and mentions + any interesting aspects in the example that might help the reader. +
  • The commented program: This section is intended for you to + understand the details of the example so that you can play with it and understand + Ginkgo and its features better. +
  • Results: This section shows the results of the code when run. Though the + results may not be completely the same, you can expect the behaviour to be similar. +
  • The plain program: This is the complete code without any comments to have + an complete overview of the code. + diff --git a/examples/custom-matrix-format/CMakeLists.txt b/examples/custom-matrix-format/CMakeLists.txt index 383ec42c735..d9633e1ab11 100644 --- a/examples/custom-matrix-format/CMakeLists.txt +++ b/examples/custom-matrix-format/CMakeLists.txt @@ -7,5 +7,7 @@ if (GINKGO_BUILD_CUDA AND GINKGO_BUILD_OMP) stencil_kernel.cu) target_link_libraries(custom-matrix-format ginkgo) target_include_directories(custom-matrix-format PRIVATE - ${PROJECT_SOURCE_DIR}) + ${PROJECT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) + # workaround for clang-cuda/g++ interaction + set_target_properties(custom-matrix-format PROPERTIES POSITION_INDEPENDENT_CODE ON) endif() diff --git a/examples/custom-matrix-format/build.sh b/examples/custom-matrix-format/build.sh index 6471c90fbfe..dbb2a67d72b 100755 --- a/examples/custom-matrix-format/build.sh +++ b/examples/custom-matrix-format/build.sh @@ -9,8 +9,8 @@ BUILD_DIR=$1 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd ) # copy libraries -LIBRARY_DIRS="core core/device_hooks reference omp cuda" -LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda" +LIBRARY_DIRS="core core/device_hooks reference omp cuda hip" +LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip" SUFFIXES=".so .dylib .dll d.so d.dylib d.dll" for prefix in ${LIBRARY_DIRS}; do for name in ${LIBRARY_NAMES}; do @@ -23,9 +23,9 @@ done # figure out correct compiler flags if ls ${THIS_DIR} | grep -F "libginkgo." 
>/dev/null; then - LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference" + LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip" else - LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced" + LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd" fi CXX="nvcc" diff --git a/examples/custom-matrix-format/custom-matrix-format.cpp b/examples/custom-matrix-format/custom-matrix-format.cpp index 5c4e750f7f9..2ed33b59bea 100644 --- a/examples/custom-matrix-format/custom-matrix-format.cpp +++ b/examples/custom-matrix-format/custom-matrix-format.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -42,8 +42,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // A CUDA kernel implementing the stencil, which will be used if running on the // CUDA executor. Unfortunately, NVCC has serious problems interpreting some // parts of Ginkgo's code, so the kernel has to be compiled separately. -extern void stencil_kernel(std::size_t size, const double *coefs, - const double *b, double *x); +template +void stencil_kernel(std::size_t size, const ValueType *coefs, + const ValueType *b, ValueType *x); // A stencil matrix class representing the 3pt stencil linear operator. @@ -57,21 +58,22 @@ extern void stencil_kernel(std::size_t size, const double *coefs, // implementation of the static create method. This method will forward all its // arguments to the constructor to create the object, and return an // std::unique_ptr to the created object. 
-class StencilMatrix : public gko::EnableLinOp, - public gko::EnableCreateMethod { +template +class StencilMatrix : public gko::EnableLinOp>, + public gko::EnableCreateMethod> { public: // This constructor will be called by the create method. Here we initialize // the coefficients of the stencil. StencilMatrix(std::shared_ptr exec, - gko::size_type size = 0, double left = -1.0, - double center = 2.0, double right = -1.0) + gko::size_type size = 0, ValueType left = -1.0, + ValueType center = 2.0, ValueType right = -1.0) : gko::EnableLinOp(exec, gko::dim<2>{size}), coefficients(exec, {left, center, right}) {} protected: - using vec = gko::matrix::Dense<>; - using coef_type = gko::Array; + using vec = gko::matrix::Dense; + using coef_type = gko::Array; // Here we implement the application of the linear operator, x = A * b. // apply_impl will be called by the apply method, after the arguments have @@ -156,14 +158,15 @@ class StencilMatrix : public gko::EnableLinOp, // Creates a stencil matrix in CSR format for the given number of discretization // points. -void generate_stencil_matrix(gko::matrix::Csr<> *matrix) +template +void generate_stencil_matrix(gko::matrix::Csr *matrix) { const auto discretization_points = matrix->get_size()[0]; auto row_ptrs = matrix->get_row_ptrs(); auto col_idxs = matrix->get_col_idxs(); auto values = matrix->get_values(); - int pos = 0; - const double coefs[] = {-1, 2, -1}; + IndexType pos = 0; + const ValueType coefs[] = {-1, 2, -1}; row_ptrs[0] = pos; for (int i = 0; i < discretization_points; ++i) { for (auto ofs : {-1, 0, 1}) { @@ -179,14 +182,15 @@ void generate_stencil_matrix(gko::matrix::Csr<> *matrix) // Generates the RHS vector given `f` and the boundary conditions. 
-template -void generate_rhs(Closure f, double u0, double u1, gko::matrix::Dense<> *rhs) +template +void generate_rhs(Closure f, ValueType u0, ValueType u1, + gko::matrix::Dense *rhs) { const auto discretization_points = rhs->get_size()[0]; auto values = rhs->get_values(); - const auto h = 1.0 / (discretization_points + 1); + const ValueType h = 1.0 / (discretization_points + 1); for (int i = 0; i < discretization_points; ++i) { - const auto xi = (i + 1) * h; + const ValueType xi = ValueType(i + 1) * h; values[i] = -f(xi) * h * h; } values[0] += u0; @@ -195,7 +199,9 @@ void generate_rhs(Closure f, double u0, double u1, gko::matrix::Dense<> *rhs) // Prints the solution `u`. -void print_solution(double u0, double u1, const gko::matrix::Dense<> *u) +template +void print_solution(ValueType u0, ValueType u1, + const gko::matrix::Dense *u) { std::cout << u0 << '\n'; for (int i = 0; i < u->get_size()[0]; ++i) { @@ -207,8 +213,9 @@ void print_solution(double u0, double u1, const gko::matrix::Dense<> *u) // Computes the 1-norm of the error given the computed `u` and the correct // solution function `correct_u`. 
-template -double calculate_error(int discretization_points, const gko::matrix::Dense<> *u, +template +double calculate_error(int discretization_points, + const gko::matrix::Dense *u, Closure correct_u) { const auto h = 1.0 / (discretization_points + 1); @@ -226,9 +233,12 @@ double calculate_error(int discretization_points, const gko::matrix::Dense<> *u, int main(int argc, char *argv[]) { // Some shortcuts - using vec = gko::matrix::Dense; - using mtx = gko::matrix::Csr; - using cg = gko::solver::Cg; + using ValueType = double; + using IndexType = int; + + using vec = gko::matrix::Dense; + using mtx = gko::matrix::Csr; + using cg = gko::solver::Cg; if (argc < 2) { std::cerr << "Usage: " << argv[0] << " DISCRETIZATION_POINTS [executor]" @@ -245,7 +255,8 @@ int main(int argc, char *argv[]) const auto omp = gko::OmpExecutor::create(); std::map> exec_map{ {"omp", omp}, - {"cuda", gko::CudaExecutor::create(0, omp)}, + {"cuda", gko::CudaExecutor::create(0, omp, true)}, + {"hip", gko::HipExecutor::create(0, omp, true)}, {"reference", gko::ReferenceExecutor::create()}}; // executor where Ginkgo will perform the computation @@ -254,8 +265,8 @@ int main(int argc, char *argv[]) const auto app_exec = exec_map["omp"]; // problem: - auto correct_u = [](double x) { return x * x * x; }; - auto f = [](double x) { return 6 * x; }; + auto correct_u = [](ValueType x) { return x * x * x; }; + auto f = [](ValueType x) { return ValueType(6) * x; }; auto u0 = correct_u(0); auto u1 = correct_u(1); @@ -267,19 +278,20 @@ int main(int argc, char *argv[]) u->get_values()[i] = 0.0; } + const ValueType reduction_factor = 1e-7; // Generate solver and solve the system cg::build() .with_criteria(gko::stop::Iteration::build() .with_max_iters(discretization_points) .on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-6) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(reduction_factor) .on(exec)) .on(exec) // notice how our custom StencilMatrix can 
be used in the same way as // any built-in type - ->generate( - StencilMatrix::create(exec, discretization_points, -1, 2, -1)) + ->generate(StencilMatrix::create(exec, discretization_points, + -1, 2, -1)) ->apply(lend(rhs), lend(u)); print_solution(u0, u1, lend(u)); diff --git a/examples/custom-matrix-format/stencil_kernel.cu b/examples/custom-matrix-format/stencil_kernel.cu index 66a471ad65e..fdd04d1aa3d 100644 --- a/examples/custom-matrix-format/stencil_kernel.cu +++ b/examples/custom-matrix-format/stencil_kernel.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -32,13 +32,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include + + +#define INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ + template _macro(float); \ + template _macro(double); + + +#define STENCIL_KERNEL(_type) \ + void stencil_kernel(std::size_t size, const _type *coefs, const _type *b, \ + _type *x); + namespace { // a parallel CUDA kernel that computes the application of a 3 point stencil -__global__ void stencil_kernel_impl(std::size_t size, const double *coefs, - const double *b, double *x) +template +__global__ void stencil_kernel_impl(std::size_t size, const ValueType *coefs, + const ValueType *b, ValueType *x) { const auto thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id >= size) { @@ -58,10 +71,13 @@ __global__ void stencil_kernel_impl(std::size_t size, const double *coefs, } // namespace -void stencil_kernel(std::size_t size, const double *coefs, const double *b, - double *x) +template +void stencil_kernel(std::size_t size, const ValueType *coefs, + const ValueType *b, ValueType *x) { constexpr auto block_size = 512; const auto grid_size = (size + block_size - 1) / block_size; stencil_kernel_impl<<>>(size, coefs, b, x); } + 
+INSTANTIATE_FOR_EACH_VALUE_TYPE(STENCIL_KERNEL); \ No newline at end of file diff --git a/examples/custom-stopping-criterion/build.sh b/examples/custom-stopping-criterion/build.sh index 8f4ba176c25..410f3e3c9cb 100755 --- a/examples/custom-stopping-criterion/build.sh +++ b/examples/custom-stopping-criterion/build.sh @@ -9,8 +9,8 @@ BUILD_DIR=$1 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd ) # copy libraries -LIBRARY_DIRS="core core/device_hooks reference omp cuda" -LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda" +LIBRARY_DIRS="core core/device_hooks reference omp cuda hip" +LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip" SUFFIXES=".so .dylib .dll d.so d.dylib d.dll" for prefix in ${LIBRARY_DIRS}; do for name in ${LIBRARY_NAMES}; do @@ -23,9 +23,9 @@ done # figure out correct compiler flags if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then - LINK_FLAGS="-lpthread -lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference" + LINK_FLAGS="-lpthread -lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip" else - LINK_FLAGS="-lpthread -lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced" + LINK_FLAGS="-lpthread -lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd" fi if [ -z "${CXX}" ]; then CXX="c++" diff --git a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp index ebfa2789eb6..975e20f3a6c 100644 --- a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp +++ b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -93,9 +93,12 @@ void run_solver(volatile bool *stop_iteration_process, std::shared_ptr exec) { // Some shortcuts - using mtx = gko::matrix::Csr<>; - using vec = gko::matrix::Dense<>; - using bicg = gko::solver::Bicgstab<>; + using ValueType = double; + using IndexType = int; + + using mtx = gko::matrix::Csr; + using vec = gko::matrix::Dense; + using bicg = gko::solver::Bicgstab; // Read Data auto A = share(gko::read(std::ifstream("data/A.mtx"), exec)); @@ -110,7 +113,7 @@ void run_solver(volatile bool *stop_iteration_process, .on(exec)) .on(exec) ->generate(A); - solver->add_logger(gko::log::Stream<>::create( + solver->add_logger(gko::log::Stream::create( exec, gko::log::Logger::iteration_complete_mask, std::cout, true)); solver->apply(lend(b), lend(x)); @@ -145,7 +148,10 @@ int main(int argc, char *argv[]) exec = gko::OmpExecutor::create(); } else if (argc == 2 && std::string(argv[1]) == "cuda" && gko::CudaExecutor::get_num_devices() > 0) { - exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); + exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true); + } else if (argc == 2 && std::string(argv[1]) == "hip" && + gko::HipExecutor::get_num_devices() > 0) { + exec = gko::HipExecutor::create(0, gko::OmpExecutor::create(), true); } else { std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl; std::exit(-1); diff --git a/examples/external-lib-interfacing/CMakeLists.txt b/examples/external-lib-interfacing/CMakeLists.txt index 2a12a9aa02f..0858b5e40d7 100644 --- a/examples/external-lib-interfacing/CMakeLists.txt +++ b/examples/external-lib-interfacing/CMakeLists.txt @@ -1,29 +1,29 @@ if(GINKGO_BUILD_EXTLIB_EXAMPLE) -# This is just an example of the CMakeLists.txt file that can be used after the -# correct version of deal.ii has been installed. 
-cmake_minimum_required(VERSION 3.8) -project(DEAL_II_EXAMPLE LANGUAGES CXX) + # This is just an example of the CMakeLists.txt file that can be used after the + # correct version of deal.ii has been installed. + cmake_minimum_required(VERSION 3.8) + project(DEAL_II_EXAMPLE LANGUAGES CXX) -find_package(MPI REQUIRED) + find_package(MPI REQUIRED) -set(deal.II_DIR "/path/to/deal.ii/installation") -find_package(deal.II 9.0.0 REQUIRED - HINTS ${deal.II_DIR} ${DEAL_II_DIR}) -DEAL_II_INITIALIZE_CACHED_VARIABLES() + set(deal.II_DIR "/path/to/deal.ii/installation") + find_package(deal.II 9.0.0 REQUIRED + HINTS ${deal.II_DIR} ${DEAL_II_DIR}) + DEAL_II_INITIALIZE_CACHED_VARIABLES() -set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) + set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) -add_executable(${PROJECT_NAME} "") -target_sources(${PROJECT_NAME} PRIVATE external-lib-interfacing.cpp) -target_compile_options(${PROJECT_NAME} PRIVATE -g -Wall) -target_compile_definitions(${PROJECT_NAME} PRIVATE OMPI_SKIP_MPICXX) + add_executable(${PROJECT_NAME} "") + target_sources(${PROJECT_NAME} PRIVATE external-lib-interfacing.cpp) + target_compile_options(${PROJECT_NAME} PRIVATE -g -Wall) + target_compile_definitions(${PROJECT_NAME} PRIVATE OMPI_SKIP_MPICXX) -target_link_libraries(${PROJECT_NAME} - ${MPI_C_LIBRARIES} Ginkgo::ginkgo) + target_link_libraries(${PROJECT_NAME} + ${MPI_C_LIBRARIES} Ginkgo::ginkgo) -target_include_directories(${PROJECT_NAME} - PRIVATE ${MPI_C_INCLUDE_PATH} ${GINKGO_INC_DIR} ${GINKGO_LIB_DIR} ) + target_include_directories(${PROJECT_NAME} + PRIVATE ${MPI_C_INCLUDE_PATH} ${GINKGO_INC_DIR} ${GINKGO_LIB_DIR} ) -DEAL_II_SETUP_TARGET(${PROJECT_NAME}) + DEAL_II_SETUP_TARGET(${PROJECT_NAME}) endif() diff --git a/examples/ginkgo-overhead/build.sh b/examples/ginkgo-overhead/build.sh index 47c10e673d9..9c3fd902cfa 100755 --- a/examples/ginkgo-overhead/build.sh +++ b/examples/ginkgo-overhead/build.sh @@ -9,8 +9,8 @@ BUILD_DIR=$1 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" 
&>/dev/null && pwd ) # copy libraries -LIBRARY_DIRS="core core/device_hooks reference omp cuda" -LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda" +LIBRARY_DIRS="core core/device_hooks reference omp cuda hip" +LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip" SUFFIXES=".so .dylib .dll d.so d.dylib d.dll" for prefix in ${LIBRARY_DIRS}; do for name in ${LIBRARY_NAMES}; do @@ -23,9 +23,9 @@ done # figure out correct compiler flags if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then - LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference" + LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip" else - LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced" + LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd" fi if [ -z "${CXX}" ]; then CXX="c++" diff --git a/examples/ginkgo-overhead/ginkgo-overhead.cpp b/examples/ginkgo-overhead/ginkgo-overhead.cpp index 2e73149c73f..b8bc7acc2b1 100644 --- a/examples/ginkgo-overhead/ginkgo-overhead.cpp +++ b/examples/ginkgo-overhead/ginkgo-overhead.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -47,9 +47,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
int main(int argc, char *argv[]) { - using vec = gko::matrix::Dense<>; - using mtx = gko::matrix::Dense<>; - using cg = gko::solver::Cg<>; + using ValueType = double; + using IndexType = int; + + using vec = gko::matrix::Dense; + using mtx = gko::matrix::Csr; + using cg = gko::solver::Cg; long unsigned num_iters = 1000000; if (argc > 2) { @@ -87,8 +90,11 @@ int main(int argc, char *argv[]) auto time = std::chrono::duration_cast(tac - tic); std::cout << "Running " << num_iters << " iterations of the CG solver took a total of " - << 1.0 * time.count() / std::nano::den << " seconds." << std::endl + << static_cast(time.count()) / + static_cast(std::nano::den) + << " seconds." << std::endl << "\tAverage library overhead: " - << 1.0 * time.count() / num_iters << " [nanoseconds / iteration]" - << std::endl; + << static_cast(time.count()) / + static_cast(num_iters) + << " [nanoseconds / iteration]" << std::endl; } diff --git a/examples/ginkgo-ranges/build.sh b/examples/ginkgo-ranges/build.sh index d53d6287c24..012cf07c9ba 100755 --- a/examples/ginkgo-ranges/build.sh +++ b/examples/ginkgo-ranges/build.sh @@ -9,8 +9,8 @@ BUILD_DIR=$1 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd ) # copy libraries -LIBRARY_DIRS="core core/device_hooks reference omp cuda" -LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda" +LIBRARY_DIRS="core core/device_hooks reference omp cuda hip" +LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip" SUFFIXES=".so .dylib .dll d.so d.dylib d.dll" for prefix in ${LIBRARY_DIRS}; do for name in ${LIBRARY_NAMES}; do @@ -23,9 +23,9 @@ done # figure out correct compiler flags if ls ${THIS_DIR} | grep -F "libginkgo." 
>/dev/null; then - LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference" + LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip" else - LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced" + LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd" fi if [ -z "${CXX}" ]; then CXX="c++" diff --git a/examples/ginkgo-ranges/ginkgo-ranges.cpp b/examples/ginkgo-ranges/ginkgo-ranges.cpp index 0e5f5d37f30..c471f967d60 100644 --- a/examples/ginkgo-ranges/ginkgo-ranges.cpp +++ b/examples/ginkgo-ranges/ginkgo-ranges.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -82,13 +82,16 @@ void print_lu(const gko::range &A) int main(int argc, char *argv[]) { + using ValueType = double; + using IndexType = int; + // Print version information std::cout << gko::version_info::get() << std::endl; // Create some test data, add some padding just to demonstrate how to use it // with ranges. // clang-format off - double data[] = { + ValueType data[] = { 2., 4., 5., -1.0, 4., 11., 12., -1.0, 6., 24., 24., -1.0 @@ -97,7 +100,8 @@ int main(int argc, char *argv[]) // Create a 3-by-3 range, with a 2D row-major accessor using data as the // underlying storage. Set the stride (a.k.a. "LDA") to 4. 
- auto A = gko::range>(data, 3u, 3u, 4u); + auto A = + gko::range>(data, 3u, 3u, 4u); // use the LU factorization routine defined above to factorize the matrix factorize(A); diff --git a/examples/ilu-preconditioned-solver/build.sh b/examples/ilu-preconditioned-solver/build.sh index e8135b95328..a21f2e37584 100755 --- a/examples/ilu-preconditioned-solver/build.sh +++ b/examples/ilu-preconditioned-solver/build.sh @@ -9,8 +9,8 @@ BUILD_DIR=$1 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd ) # copy libraries -LIBRARY_DIRS="core core/device_hooks reference omp cuda" -LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda" +LIBRARY_DIRS="core core/device_hooks reference omp cuda hip" +LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip" SUFFIXES=".so .dylib .dll d.so d.dylib d.dll" for prefix in ${LIBRARY_DIRS}; do for name in ${LIBRARY_NAMES}; do @@ -23,9 +23,9 @@ done # figure out correct compiler flags if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then - LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference" + LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip" else - LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced" + LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd" fi if [ -z "${CXX}" ]; then CXX="c++" diff --git a/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp b/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp index d616a68e6fb..3d61d61d5c8 100644 --- a/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp +++ b/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -43,9 +43,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int main(int argc, char *argv[]) { // Some shortcuts - using vec = gko::matrix::Dense<>; - using mtx = gko::matrix::Csr<>; - using gmres = gko::solver::Gmres<>; + using ValueType = double; + using IndexType = int; + + using vec = gko::matrix::Dense; + using mtx = gko::matrix::Csr; + using gmres = gko::solver::Gmres; // Print version information std::cout << gko::version_info::get() << std::endl; @@ -58,7 +61,10 @@ int main(int argc, char *argv[]) exec = gko::OmpExecutor::create(); } else if (argc == 2 && std::string(argv[1]) == "cuda" && gko::CudaExecutor::get_num_devices() > 0) { - exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); + exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true); + } else if (argc == 2 && std::string(argv[1]) == "hip" && + gko::HipExecutor::get_num_devices() > 0) { + exec = gko::HipExecutor::create(0, gko::OmpExecutor::create(), true); } else { std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl; std::exit(-1); @@ -70,15 +76,17 @@ int main(int argc, char *argv[]) auto x = gko::read(std::ifstream("data/x0.mtx"), exec); // Generate incomplete factors using ParILU - auto par_ilu_fact = gko::factorization::ParIlu<>::build().on(exec); + auto par_ilu_fact = + gko::factorization::ParIlu::build().on(exec); // Generate concrete factorization for input matrix auto par_ilu = par_ilu_fact->generate(A); // Generate an ILU preconditioner factory by setting lower and upper // triangular solver - in this case the exact triangular solves auto ilu_pre_factory = - gko::preconditioner::Ilu, - gko::solver::UpperTrs<>, false>::build() + gko::preconditioner::Ilu, + gko::solver::UpperTrs, + false>::build() .on(exec); // Use incomplete factors to generate ILU preconditioner @@ -88,12 +96,13 @@ int main(int argc, char *argv[]) // Generating a solver factory tied to a 
specific preconditioner makes sense // if there are several very similar systems to solve, and the same // solver+preconditioner combination is expected to be effective. + const gko::remove_complex reduction_factor = 1e-7; auto ilu_gmres_factory = - gko::solver::Gmres<>::build() + gmres::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(1000u).on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-15) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(reduction_factor) .on(exec)) .with_generated_preconditioner(gko::share(ilu_preconditioner)) .on(exec); diff --git a/examples/inverse-iteration/build.sh b/examples/inverse-iteration/build.sh index 7b47813df38..628f7260a01 100755 --- a/examples/inverse-iteration/build.sh +++ b/examples/inverse-iteration/build.sh @@ -9,8 +9,8 @@ BUILD_DIR=$1 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd ) # copy libraries -LIBRARY_DIRS="core core/device_hooks reference omp cuda" -LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda" +LIBRARY_DIRS="core core/device_hooks reference omp cuda hip" +LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip" SUFFIXES=".so .dylib .dll d.so d.dylib d.dll" for prefix in ${LIBRARY_DIRS}; do for name in ${LIBRARY_NAMES}; do @@ -23,9 +23,9 @@ done # figure out correct compiler flags if ls ${THIS_DIR} | grep -F "libginkgo." 
>/dev/null; then - LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference" + LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip" else - LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced" + LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd" fi if [ -z "${CXX}" ]; then CXX="c++" diff --git a/examples/inverse-iteration/inverse-iteration.cpp b/examples/inverse-iteration/inverse-iteration.cpp index 8eb68728bae..856483bfbd6 100644 --- a/examples/inverse-iteration/inverse-iteration.cpp +++ b/examples/inverse-iteration/inverse-iteration.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -66,7 +66,10 @@ int main(int argc, char *argv[]) exec = gko::OmpExecutor::create(); } else if (argc == 2 && std::string(argv[1]) == "cuda" && gko::CudaExecutor::get_num_devices() > 0) { - exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); + exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true); + } else if (argc == 2 && std::string(argv[1]) == "hip" && + gko::HipExecutor::get_num_devices() > 0) { + exec = gko::HipExecutor::create(0, gko::OmpExecutor::create(), true); } else { std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl; std::exit(-1); diff --git a/examples/ir-ilu-preconditioned-solver/CMakeLists.txt b/examples/ir-ilu-preconditioned-solver/CMakeLists.txt new file mode 100644 index 00000000000..dd77e163e59 --- /dev/null +++ b/examples/ir-ilu-preconditioned-solver/CMakeLists.txt @@ -0,0 +1,4 @@ +add_executable(ir-ilu-preconditioned-solver ir-ilu-preconditioned-solver.cpp) +target_link_libraries(ir-ilu-preconditioned-solver ginkgo) +target_include_directories(ir-ilu-preconditioned-solver PRIVATE ${PROJECT_SOURCE_DIR}) 
+configure_file(data/A.mtx data/A.mtx COPYONLY) diff --git a/examples/ir-ilu-preconditioned-solver/build.sh b/examples/ir-ilu-preconditioned-solver/build.sh new file mode 100755 index 00000000000..e3e8b513daa --- /dev/null +++ b/examples/ir-ilu-preconditioned-solver/build.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# set up script +if [ $# -ne 1 ]; then + echo -e "Usage: $0 GINKGO_BUILD_DIRECTORY" + exit 1 +fi +BUILD_DIR=$1 +THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd ) + +# copy libraries +LIBRARY_DIRS="core core/device_hooks reference omp cuda hip" +LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip" +SUFFIXES=".so .dylib .dll d.so d.dylib d.dll" +for prefix in ${LIBRARY_DIRS}; do + for name in ${LIBRARY_NAMES}; do + for suffix in ${SUFFIXES}; do + cp ${BUILD_DIR}/${prefix}/lib${name}${suffix} \ + ${THIS_DIR}/lib${name}${suffix} 2>/dev/null + done + done +done + +# figure out correct compiler flags +if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then + LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip" +else + LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd" +fi +if [ -z "${CXX}" ]; then + CXX="c++" +fi + +# build +${CXX} -std=c++11 -o ${THIS_DIR}/ir-ilu-preconditioned-solver \ + ${THIS_DIR}/ir-ilu-preconditioned-solver.cpp \ + -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \ + -L${THIS_DIR} ${LINK_FLAGS} diff --git a/examples/ir-ilu-preconditioned-solver/data/A.mtx b/examples/ir-ilu-preconditioned-solver/data/A.mtx new file mode 100644 index 00000000000..c67437da567 --- /dev/null +++ b/examples/ir-ilu-preconditioned-solver/data/A.mtx @@ -0,0 +1,114 @@ +%%MatrixMarket matrix coordinate integer symmetric +%------------------------------------------------------------------------------- +% UF Sparse Matrix Collection, Tim Davis +% http://www.cise.ufl.edu/research/sparse/matrices/JGD_Trefethen/Trefethen_20b +% name: JGD_Trefethen/Trefethen_20b +% 
[Diagonal matrices with primes, Nick Trefethen, Oxford Univ.] +% id: 2203 +% date: 2008 +% author: N. Trefethen +% ed: J.-G. Dumas +% fields: name title A id date author ed kind notes +% kind: combinatorial problem +%------------------------------------------------------------------------------- +% notes: +% Diagonal matrices with primes, Nick Trefethen, Oxford Univ. +% From Jean-Guillaume Dumas' Sparse Integer Matrix Collection, +% http://ljk.imag.fr/membres/Jean-Guillaume.Dumas/simc.html +% +% Problem 7 of the Hundred-dollar, Hundred-digit Challenge Problems, +% SIAM News, vol 35, no. 1. +% +% 7. Let A be the 20,000 x 20,000 matrix whose entries are zero +% everywhere except for the primes 2, 3, 5, 7, . . . , 224737 along the +% main diagonal and the number 1 in all the positions A(i,j) with +% |i-j| = 1,2,4,8, . . . ,16384. What is the (1,1) entry of inv(A)? +% +% http://www.siam.org/news/news.php?id=388 +% +% Filename in JGD collection: Trefethen/trefethen_20__19_minor.sms +%------------------------------------------------------------------------------- +19 19 83 +1 1 3 +2 1 1 +3 1 1 +5 1 1 +9 1 1 +17 1 1 +2 2 5 +3 2 1 +4 2 1 +6 2 1 +10 2 1 +18 2 1 +3 3 7 +4 3 1 +5 3 1 +7 3 1 +11 3 1 +19 3 1 +4 4 11 +5 4 1 +6 4 1 +8 4 1 +12 4 1 +5 5 13 +6 5 1 +7 5 1 +9 5 1 +13 5 1 +6 6 17 +7 6 1 +8 6 1 +10 6 1 +14 6 1 +7 7 19 +8 7 1 +9 7 1 +11 7 1 +15 7 1 +8 8 23 +9 8 1 +10 8 1 +12 8 1 +16 8 1 +9 9 29 +10 9 1 +11 9 1 +13 9 1 +17 9 1 +10 10 31 +11 10 1 +12 10 1 +14 10 1 +18 10 1 +11 11 37 +12 11 1 +13 11 1 +15 11 1 +19 11 1 +12 12 41 +13 12 1 +14 12 1 +16 12 1 +13 13 43 +14 13 1 +15 13 1 +17 13 1 +14 14 47 +15 14 1 +16 14 1 +18 14 1 +15 15 53 +16 15 1 +17 15 1 +19 15 1 +16 16 59 +17 16 1 +18 16 1 +17 17 61 +18 17 1 +19 17 1 +18 18 67 +19 18 1 +19 19 71 diff --git a/examples/ir-ilu-preconditioned-solver/doc/builds-on b/examples/ir-ilu-preconditioned-solver/doc/builds-on new file mode 100644 index 00000000000..7c236123b46 --- /dev/null +++ 
b/examples/ir-ilu-preconditioned-solver/doc/builds-on @@ -0,0 +1 @@ +ilu-preconditioned-solver iterative-refinement diff --git a/examples/ir-ilu-preconditioned-solver/doc/intro.dox b/examples/ir-ilu-preconditioned-solver/doc/intro.dox new file mode 100644 index 00000000000..64e3322a219 --- /dev/null +++ b/examples/ir-ilu-preconditioned-solver/doc/intro.dox @@ -0,0 +1,15 @@ + +

    Introduction

    + +

    About the example

    +This example shows how to combine iterative refinement with the adaptive +precision block-Jacobi preconditioner in order to approximately solve the +triangular systems occurring in ILU preconditioning. Using an adaptive precision +block-Jacobi preconditioner matrix as inner solver for the iterative refinement +method is equivalent to doing adaptive precision block-Jacobi relaxation in the +triangular solves. This example roughly approximates the triangular solves with +five adaptive precision block-Jacobi sweeps with a maximum block size of 16. + +This example is motivated by "Multiprecision block-Jacobi for Iterative +Triangular Solves" (Göbel, Anzt, Cojean, Flegar, Quintana-Ortí, Euro-Par 2020). +The theory and a detailed analysis can be found there. diff --git a/examples/ir-ilu-preconditioned-solver/doc/kind b/examples/ir-ilu-preconditioned-solver/doc/kind new file mode 100644 index 00000000000..53a96d5771f --- /dev/null +++ b/examples/ir-ilu-preconditioned-solver/doc/kind @@ -0,0 +1 @@ +preconditioners diff --git a/examples/ir-ilu-preconditioned-solver/doc/results.dox b/examples/ir-ilu-preconditioned-solver/doc/results.dox new file mode 100644 index 00000000000..eaaaa5758cd --- /dev/null +++ b/examples/ir-ilu-preconditioned-solver/doc/results.dox @@ -0,0 +1,37 @@ +

    Results

    +This is the expected output: + +@code{.cpp} +Using 5 block-Jacobi sweeps. +Solution (x): +%%MatrixMarket matrix array real general +19 1 +0.252218 +0.108645 +0.0662811 +0.0630433 +0.0384088 +0.0396536 +0.0402648 +0.0338935 +0.0193098 +0.0234653 +0.0211499 +0.0196413 +0.0199151 +0.0181674 +0.0162722 +0.0150714 +0.0107016 +0.0121141 +0.0123025 +GMRES iteration count: 7 +GMRES execution time [ms]: 2.64993 +Residual norm sqrt(r^T r): +%%MatrixMarket matrix array real general +1 1 +2.23805e-10 + +@endcode + +

    Comments about programming and debugging

    diff --git a/examples/ir-ilu-preconditioned-solver/doc/short-intro b/examples/ir-ilu-preconditioned-solver/doc/short-intro new file mode 100644 index 00000000000..3f8cd5ad813 --- /dev/null +++ b/examples/ir-ilu-preconditioned-solver/doc/short-intro @@ -0,0 +1 @@ +The IR-ILU preconditioned solver example. diff --git a/examples/ir-ilu-preconditioned-solver/doc/tooltip b/examples/ir-ilu-preconditioned-solver/doc/tooltip new file mode 100644 index 00000000000..7b7208257aa --- /dev/null +++ b/examples/ir-ilu-preconditioned-solver/doc/tooltip @@ -0,0 +1,5 @@ +Generate an incomplete factorization. +Generate an ILU preconditioner from a factorization. +Use an iterative solver to solve the triangular systems in the preconditioner. +Use an ILU preconditioner in an iterative solver. +Solve a linear system. diff --git a/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp b/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp new file mode 100644 index 00000000000..620e755d490 --- /dev/null +++ b/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp @@ -0,0 +1,185 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + + +#include + + +#include +#include +#include +#include + + +int main(int argc, char *argv[]) +{ + // Some shortcuts + using ValueType = double; + using IndexType = int; + + using vec = gko::matrix::Dense; + using mtx = gko::matrix::Csr; + using gmres = gko::solver::Gmres; + using ir = gko::solver::Ir; + using bj = gko::preconditioner::Jacobi; + + // Print version information + std::cout << gko::version_info::get() << std::endl; + + // Figure out where to run the code and how many block-Jacobi sweeps to use + std::shared_ptr exec; + if (argc == 1 || std::string(argv[1]) == "reference") { + exec = gko::ReferenceExecutor::create(); + } else if ((argc == 2 || argc == 3) && std::string(argv[1]) == "omp") { + exec = gko::OmpExecutor::create(); + } else if ((argc == 2 || argc == 3) && std::string(argv[1]) == "cuda" && + gko::CudaExecutor::get_num_devices() > 0) { + exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true); + } else if ((argc == 2 || argc == 3) && std::string(argv[1]) == "hip" && + gko::HipExecutor::get_num_devices() > 0) { + exec = gko::HipExecutor::create(0, gko::OmpExecutor::create(), true); + } else { + std::cerr << 
"Usage: " << argv[0] << " [executor] [sweeps]" + << std::endl; + std::exit(-1); + } + unsigned int sweeps = (argc == 3) ? atoi(argv[2]) : 5u; + + // Read data + auto A = gko::share(gko::read(std::ifstream("data/A.mtx"), exec)); + // Create RHS and initial guess as 1 + gko::size_type num_rows = A->get_size()[0]; + auto host_x = vec::create(exec->get_master(), gko::dim<2>(num_rows, 1)); + for (gko::size_type i = 0; i < num_rows; i++) { + host_x->at(i, 0) = 1.; + } + auto x = vec::create(exec); + auto b = vec::create(exec); + x->copy_from(host_x.get()); + b->copy_from(host_x.get()); + auto clone_x = vec::create(exec); + clone_x->copy_from(lend(x)); + + // Generate incomplete factors using ParILU + auto par_ilu_fact = + gko::factorization::ParIlu::build().on(exec); + // Generate concrete factorization for input matrix + auto par_ilu = par_ilu_fact->generate(A); + + // Generate an iterative refinement factory to be used as a triangular + // solver in the preconditioner application. The generated method is + // equivalent to doing five block-Jacobi sweeps with a maximum block size + // of 16. + auto bj_factory = + bj::build() + .with_max_block_size(16u) + .with_storage_optimization(gko::precision_reduction::autodetect()) + .on(exec); + + auto trisolve_factory = + ir::build() + .with_solver(share(bj_factory)) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(sweeps).on(exec)) + .on(exec); + + // Generate an ILU preconditioner factory by setting lower and upper + // triangular solver - in this case the previously defined iterative + // refinement method. 
+ auto ilu_pre_factory = + gko::preconditioner::Ilu::build() + .with_l_solver_factory(gko::clone(trisolve_factory)) + .with_u_solver_factory(gko::clone(trisolve_factory)) + .on(exec); + + // Use incomplete factors to generate ILU preconditioner + auto ilu_preconditioner = ilu_pre_factory->generate(gko::share(par_ilu)); + + // Create stopping criteria for Gmres + const gko::remove_complex reduction_factor = 1e-12; + auto iter_stop = + gko::stop::Iteration::build().with_max_iters(1000u).on(exec); + auto tol_stop = gko::stop::ResidualNormReduction::build() + .with_reduction_factor(reduction_factor) + .on(exec); + + std::shared_ptr> logger = + gko::log::Convergence::create(exec); + iter_stop->add_logger(logger); + tol_stop->add_logger(logger); + + // Use preconditioner inside GMRES solver factory + // Generating a solver factory tied to a specific preconditioner makes sense + // if there are several very similar systems to solve, and the same + // solver+preconditioner combination is expected to be effective. + auto ilu_gmres_factory = + gmres::build() + .with_criteria(gko::share(iter_stop), gko::share(tol_stop)) + .with_generated_preconditioner(gko::share(ilu_preconditioner)) + .on(exec); + + // Generate preconditioned solver for a specific target system + auto ilu_gmres = ilu_gmres_factory->generate(A); + + // Warmup run + ilu_gmres->apply(lend(b), lend(x)); + + // Solve system 100 times and take the average time. + std::chrono::nanoseconds time(0); + for (int i = 0; i < 100; i++) { + x->copy_from(lend(clone_x)); + auto tic = std::chrono::high_resolution_clock::now(); + ilu_gmres->apply(lend(b), lend(x)); + auto toc = std::chrono::high_resolution_clock::now(); + time += std::chrono::duration_cast(toc - tic); + } + + std::cout << "Using " << sweeps << " block-Jacobi sweeps. 
\n"; + + // Print solution + std::cout << "Solution (x): \n"; + write(std::cout, gko::lend(x)); + + // Calculate residual + auto one = gko::initialize({1.0}, exec); + auto neg_one = gko::initialize({-1.0}, exec); + auto res = gko::initialize({0.0}, exec); + A->apply(gko::lend(one), gko::lend(x), gko::lend(neg_one), gko::lend(b)); + b->compute_norm2(gko::lend(res)); + + std::cout << "GMRES iteration count: " << logger->get_num_iterations() + << "\n"; + std::cout << "GMRES execution time [ms]: " + << static_cast(time.count()) / 100000000.0 << "\n"; + std::cout << "Residual norm sqrt(r^T r): \n"; + write(std::cout, gko::lend(res)); +} diff --git a/examples/iterative-refinement/CMakeLists.txt b/examples/iterative-refinement/CMakeLists.txt new file mode 100644 index 00000000000..a21b54d2a96 --- /dev/null +++ b/examples/iterative-refinement/CMakeLists.txt @@ -0,0 +1,4 @@ +add_executable(iterative-refinement iterative-refinement.cpp) +target_link_libraries(iterative-refinement ginkgo) +target_include_directories(iterative-refinement PRIVATE ${PROJECT_SOURCE_DIR}) +configure_file(data/A.mtx data/A.mtx COPYONLY) diff --git a/examples/iterative-refinement/build.sh b/examples/iterative-refinement/build.sh new file mode 100755 index 00000000000..06f7d201f1b --- /dev/null +++ b/examples/iterative-refinement/build.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# set up script +if [ $# -ne 1 ]; then + echo -e "Usage: $0 GINKGO_BUILD_DIRECTORY" + exit 1 +fi +BUILD_DIR=$1 +THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd ) + +# copy libraries +LIBRARY_DIRS="core core/device_hooks reference omp cuda hip" +LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip" +SUFFIXES=".so .dylib .dll d.so d.dylib d.dll" +for prefix in ${LIBRARY_DIRS}; do + for name in ${LIBRARY_NAMES}; do + for suffix in ${SUFFIXES}; do + cp ${BUILD_DIR}/${prefix}/lib${name}${suffix} \ + ${THIS_DIR}/lib${name}${suffix} 2>/dev/null + done + done +done + +# figure out correct compiler 
flags +if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then + LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip" +else + LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd" +fi +if [ -z "${CXX}" ]; then + CXX="c++" +fi + +# build +${CXX} -std=c++11 -o ${THIS_DIR}/iterative-refinement \ + ${THIS_DIR}/iterative-refinement.cpp \ + -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \ + -L${THIS_DIR} ${LINK_FLAGS} diff --git a/examples/iterative-refinement/data/A.mtx b/examples/iterative-refinement/data/A.mtx new file mode 100644 index 00000000000..c67437da567 --- /dev/null +++ b/examples/iterative-refinement/data/A.mtx @@ -0,0 +1,114 @@ +%%MatrixMarket matrix coordinate integer symmetric +%------------------------------------------------------------------------------- +% UF Sparse Matrix Collection, Tim Davis +% http://www.cise.ufl.edu/research/sparse/matrices/JGD_Trefethen/Trefethen_20b +% name: JGD_Trefethen/Trefethen_20b +% [Diagonal matrices with primes, Nick Trefethen, Oxford Univ.] +% id: 2203 +% date: 2008 +% author: N. Trefethen +% ed: J.-G. Dumas +% fields: name title A id date author ed kind notes +% kind: combinatorial problem +%------------------------------------------------------------------------------- +% notes: +% Diagonal matrices with primes, Nick Trefethen, Oxford Univ. +% From Jean-Guillaume Dumas' Sparse Integer Matrix Collection, +% http://ljk.imag.fr/membres/Jean-Guillaume.Dumas/simc.html +% +% Problem 7 of the Hundred-dollar, Hundred-digit Challenge Problems, +% SIAM News, vol 35, no. 1. +% +% 7. Let A be the 20,000 x 20,000 matrix whose entries are zero +% everywhere except for the primes 2, 3, 5, 7, . . . , 224737 along the +% main diagonal and the number 1 in all the positions A(i,j) with +% |i-j| = 1,2,4,8, . . . ,16384. What is the (1,1) entry of inv(A)? 
+% +% http://www.siam.org/news/news.php?id=388 +% +% Filename in JGD collection: Trefethen/trefethen_20__19_minor.sms +%------------------------------------------------------------------------------- +19 19 83 +1 1 3 +2 1 1 +3 1 1 +5 1 1 +9 1 1 +17 1 1 +2 2 5 +3 2 1 +4 2 1 +6 2 1 +10 2 1 +18 2 1 +3 3 7 +4 3 1 +5 3 1 +7 3 1 +11 3 1 +19 3 1 +4 4 11 +5 4 1 +6 4 1 +8 4 1 +12 4 1 +5 5 13 +6 5 1 +7 5 1 +9 5 1 +13 5 1 +6 6 17 +7 6 1 +8 6 1 +10 6 1 +14 6 1 +7 7 19 +8 7 1 +9 7 1 +11 7 1 +15 7 1 +8 8 23 +9 8 1 +10 8 1 +12 8 1 +16 8 1 +9 9 29 +10 9 1 +11 9 1 +13 9 1 +17 9 1 +10 10 31 +11 10 1 +12 10 1 +14 10 1 +18 10 1 +11 11 37 +12 11 1 +13 11 1 +15 11 1 +19 11 1 +12 12 41 +13 12 1 +14 12 1 +16 12 1 +13 13 43 +14 13 1 +15 13 1 +17 13 1 +14 14 47 +15 14 1 +16 14 1 +18 14 1 +15 15 53 +16 15 1 +17 15 1 +19 15 1 +16 16 59 +17 16 1 +18 16 1 +17 17 61 +18 17 1 +19 17 1 +18 18 67 +19 18 1 +19 19 71 diff --git a/examples/iterative-refinement/doc/builds-on b/examples/iterative-refinement/doc/builds-on new file mode 100644 index 00000000000..369aa997770 --- /dev/null +++ b/examples/iterative-refinement/doc/builds-on @@ -0,0 +1 @@ +simple-solver diff --git a/examples/iterative-refinement/doc/intro.dox b/examples/iterative-refinement/doc/intro.dox new file mode 100644 index 00000000000..049c0f24cc7 --- /dev/null +++ b/examples/iterative-refinement/doc/intro.dox @@ -0,0 +1,8 @@ + +

    This example shows how to use the iterative refinement solver.

    + +

    In this example, we first read in a matrix from a file, then generate a +right-hand side and an initial guess. An inaccurate CG solver is used as the +inner solver of an iterative refinement (IR) method which solves a linear +system. The example reports the iteration count and runtime of the IR solver. +

    diff --git a/examples/iterative-refinement/doc/kind b/examples/iterative-refinement/doc/kind new file mode 100644 index 00000000000..15a13db4511 --- /dev/null +++ b/examples/iterative-refinement/doc/kind @@ -0,0 +1 @@ +basic diff --git a/examples/iterative-refinement/doc/results.dox b/examples/iterative-refinement/doc/results.dox new file mode 100644 index 00000000000..1ee878f6e02 --- /dev/null +++ b/examples/iterative-refinement/doc/results.dox @@ -0,0 +1,19 @@ +

    Results

    +This is the expected output: + +@code{.cpp} + +Initial residual norm sqrt(r^T r): +%%MatrixMarket matrix array real general +1 1 +194.679 +Final residual norm sqrt(r^T r): +%%MatrixMarket matrix array real general +1 1 +4.23821e-11 +IR iteration count: 24 +IR execution time [ms]: 18.0692 + +@endcode + +

    Comments about programming and debugging

    diff --git a/examples/iterative-refinement/doc/short-intro b/examples/iterative-refinement/doc/short-intro new file mode 100644 index 00000000000..a91594e87a5 --- /dev/null +++ b/examples/iterative-refinement/doc/short-intro @@ -0,0 +1 @@ +The iterative refinement solver example. diff --git a/examples/iterative-refinement/doc/tooltip b/examples/iterative-refinement/doc/tooltip new file mode 100644 index 00000000000..852c8b02e65 --- /dev/null +++ b/examples/iterative-refinement/doc/tooltip @@ -0,0 +1 @@ +Use an iterative refinement method in Ginkgo. Solve a linear system. diff --git a/examples/iterative-refinement/iterative-refinement.cpp b/examples/iterative-refinement/iterative-refinement.cpp new file mode 100644 index 00000000000..b0463db125d --- /dev/null +++ b/examples/iterative-refinement/iterative-refinement.cpp @@ -0,0 +1,149 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + + +#include + + +#include +#include +#include +#include + + +int main(int argc, char *argv[]) +{ + // Some shortcuts + using ValueType = double; + using IndexType = int; + using vec = gko::matrix::Dense; + using mtx = gko::matrix::Csr; + using cg = gko::solver::Cg; + using ir = gko::solver::Ir; + + // Print version information + std::cout << gko::version_info::get() << std::endl; + + // Figure out where to run the code + std::shared_ptr exec; + if (argc == 1 || std::string(argv[1]) == "reference") { + exec = gko::ReferenceExecutor::create(); + } else if (argc == 2 && std::string(argv[1]) == "omp") { + exec = gko::OmpExecutor::create(); + } else if (argc == 2 && std::string(argv[1]) == "cuda" && + gko::CudaExecutor::get_num_devices() > 0) { + exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true); + } else if (argc == 2 && std::string(argv[1]) == "hip" && + gko::HipExecutor::get_num_devices() > 0) { + exec = gko::HipExecutor::create(0, gko::OmpExecutor::create(), true); + } else { + std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl; + std::exit(-1); + } + + // Read data + auto A = share(gko::read(std::ifstream("data/A.mtx"), exec)); + // Create RHS and initial guess as 1 + gko::size_type size = A->get_size()[0]; + auto host_x = gko::matrix::Dense::create(exec->get_master(), + gko::dim<2>(size, 1)); + for (auto i = 0; i < size; i++) { + host_x->at(i, 0) = 1.; + 
} + auto x = gko::matrix::Dense::create(exec); + auto b = gko::matrix::Dense::create(exec); + x->copy_from(host_x.get()); + b->copy_from(host_x.get()); + + // Calculate initial residual by overwriting b + auto one = gko::initialize({1.0}, exec); + auto neg_one = gko::initialize({-1.0}, exec); + auto initres = gko::initialize({0.0}, exec); + A->apply(lend(one), lend(x), lend(neg_one), lend(b)); + b->compute_norm2(lend(initres)); + + // copy b again + b->copy_from(host_x.get()); + gko::size_type max_iters = 10000u; + gko::remove_complex outer_reduction_factor = 1e-12; + auto iter_stop = + gko::stop::Iteration::build().with_max_iters(max_iters).on(exec); + auto tol_stop = gko::stop::ResidualNormReduction::build() + .with_reduction_factor(outer_reduction_factor) + .on(exec); + + std::shared_ptr> logger = + gko::log::Convergence::create(exec); + iter_stop->add_logger(logger); + tol_stop->add_logger(logger); + + // Create solver factory + gko::remove_complex inner_reduction_factor = 1e-2; + auto solver_gen = + ir::build() + .with_solver( + cg::build() + .with_criteria( + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(inner_reduction_factor) + .on(exec)) + .on(exec)) + .with_criteria(gko::share(iter_stop), gko::share(tol_stop)) + .on(exec); + // Create solver + auto solver = solver_gen->generate(A); + + + // Solve system + exec->synchronize(); + std::chrono::nanoseconds time(0); + auto tic = std::chrono::steady_clock::now(); + solver->apply(lend(b), lend(x)); + auto toc = std::chrono::steady_clock::now(); + time += std::chrono::duration_cast(toc - tic); + + // Calculate residual + auto res = gko::initialize({0.0}, exec); + A->apply(lend(one), lend(x), lend(neg_one), lend(b)); + b->compute_norm2(lend(res)); + + std::cout << "Initial residual norm sqrt(r^T r): \n"; + write(std::cout, lend(initres)); + std::cout << "Final residual norm sqrt(r^T r): \n"; + write(std::cout, lend(res)); + + // Print solver statistics + std::cout << "IR iteration count: " << 
logger->get_num_iterations() + << std::endl; + std::cout << "IR execution time [ms]: " + << static_cast(time.count()) / 1000000.0 << std::endl; +} diff --git a/examples/minimal-cuda-solver/build.sh b/examples/minimal-cuda-solver/build.sh index 0c75e6c1ef3..422db49149b 100755 --- a/examples/minimal-cuda-solver/build.sh +++ b/examples/minimal-cuda-solver/build.sh @@ -9,8 +9,8 @@ BUILD_DIR=$1 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd ) # copy libraries -LIBRARY_DIRS="core core/device_hooks reference omp cuda" -LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda" +LIBRARY_DIRS="core core/device_hooks reference omp cuda hip" +LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip" SUFFIXES=".so .dylib .dll d.so d.dylib d.dll" for prefix in ${LIBRARY_DIRS}; do for name in ${LIBRARY_NAMES}; do @@ -23,9 +23,9 @@ done # figure out correct compiler flags if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then - LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference" + LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip" else - LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced" + LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd" fi if [ -z "${CXX}" ]; then CXX="c++" diff --git a/examples/minimal-cuda-solver/minimal-cuda-solver.cpp b/examples/minimal-cuda-solver/minimal-cuda-solver.cpp index 8cabf27db58..1b47f712766 100644 --- a/examples/minimal-cuda-solver/minimal-cuda-solver.cpp +++ b/examples/minimal-cuda-solver/minimal-cuda-solver.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -36,7 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
int main() { // Instantiate a CUDA executor - auto gpu = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); + auto gpu = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true); // Read data auto A = gko::read>(std::cin, gpu); auto b = gko::read>(std::cin, gpu); diff --git a/examples/mixed-precision-ir/CMakeLists.txt b/examples/mixed-precision-ir/CMakeLists.txt new file mode 100644 index 00000000000..e4f81ef2c55 --- /dev/null +++ b/examples/mixed-precision-ir/CMakeLists.txt @@ -0,0 +1,4 @@ +add_executable(mixed-precision-ir mixed-precision-ir.cpp) +target_link_libraries(mixed-precision-ir ginkgo) +target_include_directories(mixed-precision-ir PRIVATE ${PROJECT_SOURCE_DIR}) +configure_file(data/A.mtx data/A.mtx COPYONLY) diff --git a/examples/mixed-precision-ir/build.sh b/examples/mixed-precision-ir/build.sh new file mode 100755 index 00000000000..a73ea3cde18 --- /dev/null +++ b/examples/mixed-precision-ir/build.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# set up script +if [ $# -ne 1 ]; then + echo -e "Usage: $0 GINKGO_BUILD_DIRECTORY" + exit 1 +fi +BUILD_DIR=$1 +THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd ) + +# copy libraries +LIBRARY_DIRS="core core/device_hooks reference omp cuda hip" +LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip" +SUFFIXES=".so .dylib .dll d.so d.dylib d.dll" +for prefix in ${LIBRARY_DIRS}; do + for name in ${LIBRARY_NAMES}; do + for suffix in ${SUFFIXES}; do + cp ${BUILD_DIR}/${prefix}/lib${name}${suffix} \ + ${THIS_DIR}/lib${name}${suffix} 2>/dev/null + done + done +done + +# figure out correct compiler flags +if ls ${THIS_DIR} | grep -F "libginkgo." 
>/dev/null; then + LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip" +else + LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd" +fi +if [ -z "${CXX}" ]; then + CXX="c++" +fi + +# build +${CXX} -std=c++11 -o ${THIS_DIR}/mixed-precision-ir \ + ${THIS_DIR}/mixed-precision-ir.cpp \ + -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \ + -L${THIS_DIR} ${LINK_FLAGS} diff --git a/examples/mixed-precision-ir/data/A.mtx b/examples/mixed-precision-ir/data/A.mtx new file mode 100644 index 00000000000..c67437da567 --- /dev/null +++ b/examples/mixed-precision-ir/data/A.mtx @@ -0,0 +1,114 @@ +%%MatrixMarket matrix coordinate integer symmetric +%------------------------------------------------------------------------------- +% UF Sparse Matrix Collection, Tim Davis +% http://www.cise.ufl.edu/research/sparse/matrices/JGD_Trefethen/Trefethen_20b +% name: JGD_Trefethen/Trefethen_20b +% [Diagonal matrices with primes, Nick Trefethen, Oxford Univ.] +% id: 2203 +% date: 2008 +% author: N. Trefethen +% ed: J.-G. Dumas +% fields: name title A id date author ed kind notes +% kind: combinatorial problem +%------------------------------------------------------------------------------- +% notes: +% Diagonal matrices with primes, Nick Trefethen, Oxford Univ. +% From Jean-Guillaume Dumas' Sparse Integer Matrix Collection, +% http://ljk.imag.fr/membres/Jean-Guillaume.Dumas/simc.html +% +% Problem 7 of the Hundred-dollar, Hundred-digit Challenge Problems, +% SIAM News, vol 35, no. 1. +% +% 7. Let A be the 20,000 x 20,000 matrix whose entries are zero +% everywhere except for the primes 2, 3, 5, 7, . . . , 224737 along the +% main diagonal and the number 1 in all the positions A(i,j) with +% |i-j| = 1,2,4,8, . . . ,16384. What is the (1,1) entry of inv(A)? 
+% +% http://www.siam.org/news/news.php?id=388 +% +% Filename in JGD collection: Trefethen/trefethen_20__19_minor.sms +%------------------------------------------------------------------------------- +19 19 83 +1 1 3 +2 1 1 +3 1 1 +5 1 1 +9 1 1 +17 1 1 +2 2 5 +3 2 1 +4 2 1 +6 2 1 +10 2 1 +18 2 1 +3 3 7 +4 3 1 +5 3 1 +7 3 1 +11 3 1 +19 3 1 +4 4 11 +5 4 1 +6 4 1 +8 4 1 +12 4 1 +5 5 13 +6 5 1 +7 5 1 +9 5 1 +13 5 1 +6 6 17 +7 6 1 +8 6 1 +10 6 1 +14 6 1 +7 7 19 +8 7 1 +9 7 1 +11 7 1 +15 7 1 +8 8 23 +9 8 1 +10 8 1 +12 8 1 +16 8 1 +9 9 29 +10 9 1 +11 9 1 +13 9 1 +17 9 1 +10 10 31 +11 10 1 +12 10 1 +14 10 1 +18 10 1 +11 11 37 +12 11 1 +13 11 1 +15 11 1 +19 11 1 +12 12 41 +13 12 1 +14 12 1 +16 12 1 +13 13 43 +14 13 1 +15 13 1 +17 13 1 +14 14 47 +15 14 1 +16 14 1 +18 14 1 +15 15 53 +16 15 1 +17 15 1 +19 15 1 +16 16 59 +17 16 1 +18 16 1 +17 17 61 +18 17 1 +19 17 1 +18 18 67 +19 18 1 +19 19 71 diff --git a/examples/mixed-precision-ir/doc/builds-on b/examples/mixed-precision-ir/doc/builds-on new file mode 100644 index 00000000000..732380a55b6 --- /dev/null +++ b/examples/mixed-precision-ir/doc/builds-on @@ -0,0 +1 @@ +iterative-refinement diff --git a/examples/mixed-precision-ir/doc/intro.dox b/examples/mixed-precision-ir/doc/intro.dox new file mode 100644 index 00000000000..167972f3768 --- /dev/null +++ b/examples/mixed-precision-ir/doc/intro.dox @@ -0,0 +1,8 @@ + +

    This example manually implements a Mixed Precision Iterative Refinement (MPIR) solver.

    + +

    In this example, we first read in a matrix from file, then generate a +right-hand side and an initial guess. An inaccurate single-precision CG solver +is used as the inner solver of a double-precision iterative refinement (IR) +method which solves a linear system. +

    diff --git a/examples/mixed-precision-ir/doc/kind b/examples/mixed-precision-ir/doc/kind new file mode 100644 index 00000000000..c1d9154931a --- /dev/null +++ b/examples/mixed-precision-ir/doc/kind @@ -0,0 +1 @@ +techniques diff --git a/examples/mixed-precision-ir/doc/results.dox b/examples/mixed-precision-ir/doc/results.dox new file mode 100644 index 00000000000..93abb0f7519 --- /dev/null +++ b/examples/mixed-precision-ir/doc/results.dox @@ -0,0 +1,19 @@ +

    Results

    +This is the expected output: + +@code{.cpp} + +Initial residual norm sqrt(r^T r): +%%MatrixMarket matrix array real general +1 1 +194.679 +Final residual norm sqrt(r^T r): +%%MatrixMarket matrix array real general +1 1 +1.22728e-10 +MPIR iteration count: 25 +MPIR execution time [ms]: 18.0933 + +@endcode + +

    Comments about programming and debugging

    diff --git a/examples/mixed-precision-ir/doc/short-intro b/examples/mixed-precision-ir/doc/short-intro new file mode 100644 index 00000000000..df19909cc80 --- /dev/null +++ b/examples/mixed-precision-ir/doc/short-intro @@ -0,0 +1 @@ +The Mixed Precision Iterative Refinement (MPIR) solver example. diff --git a/examples/mixed-precision-ir/doc/tooltip b/examples/mixed-precision-ir/doc/tooltip new file mode 100644 index 00000000000..b0cce88707b --- /dev/null +++ b/examples/mixed-precision-ir/doc/tooltip @@ -0,0 +1 @@ +Manually implement a Mixed Precision Iterative Refinement (MPIR) method in Ginkgo. Solve a linear system. diff --git a/examples/mixed-precision-ir/mixed-precision-ir.cpp b/examples/mixed-precision-ir/mixed-precision-ir.cpp new file mode 100644 index 00000000000..395b553f6c9 --- /dev/null +++ b/examples/mixed-precision-ir/mixed-precision-ir.cpp @@ -0,0 +1,177 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + + +#include + + +#include +#include +#include +#include + + +int main(int argc, char *argv[]) +{ + // Some shortcuts + using ValueType = double; + using SolverType = float; + using IndexType = int; + using vec = gko::matrix::Dense; + using solver_vec = gko::matrix::Dense; + using mtx = gko::matrix::Csr; + using solver_mtx = gko::matrix::Csr; + using cg = gko::solver::Cg; + + gko::size_type max_outer_iters = 100u; + gko::size_type max_inner_iters = 100u; + gko::remove_complex outer_reduction_factor = 1e-12; + gko::remove_complex inner_reduction_factor = 1e-2; + + // Print version information + std::cout << gko::version_info::get() << std::endl; + + // Figure out where to run the code + std::shared_ptr exec; + if (argc == 1 || std::string(argv[1]) == "reference") { + exec = gko::ReferenceExecutor::create(); + } else if (argc == 2 && std::string(argv[1]) == "omp") { + exec = gko::OmpExecutor::create(); + } else if (argc == 2 && std::string(argv[1]) == "cuda" && + gko::CudaExecutor::get_num_devices() > 0) { + exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true); + } else if (argc == 2 && std::string(argv[1]) == "hip" && + gko::HipExecutor::get_num_devices() > 0) { + exec = gko::HipExecutor::create(0, gko::OmpExecutor::create(), true); + } else { + std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl; + std::exit(-1); + } + + // Read data + auto A = 
share(gko::read(std::ifstream("data/A.mtx"), exec)); + // Create RHS and initial guess as 1 + gko::size_type size = A->get_size()[0]; + auto host_x = vec::create(exec->get_master(), gko::dim<2>(size, 1)); + for (auto i = 0; i < size; i++) { + host_x->at(i, 0) = 1.; + } + auto x = vec::create(exec); + auto b = vec::create(exec); + x->copy_from(host_x.get()); + b->copy_from(host_x.get()); + + // Calculate initial residual by overwriting b + auto one = gko::initialize({1.0}, exec); + auto neg_one = gko::initialize({-1.0}, exec); + auto initres_vec = gko::initialize({0.0}, exec); + A->apply(lend(one), lend(x), lend(neg_one), lend(b)); + b->compute_norm2(lend(initres_vec)); + + // Build lower-precision system matrix and residual + auto solver_A = solver_mtx::create(exec); + auto inner_residual = solver_vec::create(exec); + auto outer_residual = vec::create(exec); + A->convert_to(lend(solver_A)); + b->convert_to(lend(outer_residual)); + + // restore b + b->copy_from(host_x.get()); + + // Create inner solver + auto inner_solver = + cg::build() + .with_criteria(gko::stop::ResidualNormReduction::build() + .with_reduction_factor(inner_reduction_factor) + .on(exec), + gko::stop::Iteration::build() + .with_max_iters(max_inner_iters) + .on(exec)) + .on(exec) + ->generate(give(solver_A)); + + // Solve system + exec->synchronize(); + std::chrono::nanoseconds time(0); + auto res_vec = gko::initialize({0.0}, exec); + auto initres = exec->copy_val_to_host(initres_vec->get_const_values()); + auto inner_solution = solver_vec::create(exec); + auto outer_delta = vec::create(exec); + auto tic = std::chrono::steady_clock::now(); + int iter = -1; + while (true) { + ++iter; + + // convert residual to inner precision + outer_residual->convert_to(lend(inner_residual)); + outer_residual->compute_norm2(lend(res_vec)); + auto res = exec->copy_val_to_host(res_vec->get_const_values()); + + // break if we exceed the number of iterations or have converged + if (iter > max_outer_iters || res / 
initres < outer_reduction_factor) { + break; + } + + // Use the inner solver to solve + // A * inner_solution = inner_residual + // with residual as initial guess. + inner_solution->copy_from(lend(inner_residual)); + inner_solver->apply(lend(inner_residual), lend(inner_solution)); + + // convert inner solution to outer precision + inner_solution->convert_to(lend(outer_delta)); + + // x = x + inner_solution + x->add_scaled(lend(one), lend(outer_delta)); + + // residual = b - A * x + outer_residual->copy_from(lend(b)); + A->apply(lend(neg_one), lend(x), lend(one), lend(outer_residual)); + } + + auto toc = std::chrono::steady_clock::now(); + time += std::chrono::duration_cast(toc - tic); + + // Calculate residual + A->apply(lend(one), lend(x), lend(neg_one), lend(b)); + b->compute_norm2(lend(res_vec)); + + std::cout << "Initial residual norm sqrt(r^T r): \n"; + write(std::cout, lend(initres_vec)); + std::cout << "Final residual norm sqrt(r^T r): \n"; + write(std::cout, lend(res_vec)); + + // Print solver statistics + std::cout << "MPIR iteration count: " << iter << std::endl; + std::cout << "MPIR execution time [ms]: " + << static_cast(time.count()) / 1000000.0 << std::endl; +} diff --git a/examples/nine-pt-stencil-solver/build.sh b/examples/nine-pt-stencil-solver/build.sh old mode 100644 new mode 100755 index 6f5a4dfdb0c..79af2d5055b --- a/examples/nine-pt-stencil-solver/build.sh +++ b/examples/nine-pt-stencil-solver/build.sh @@ -9,8 +9,8 @@ BUILD_DIR=$1 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd ) # copy libraries -LIBRARY_DIRS="core core/device_hooks reference omp cuda" -LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda" +LIBRARY_DIRS="core core/device_hooks reference omp cuda hip" +LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip" SUFFIXES=".so .dylib .dll d.so d.dylib d.dll" for prefix in ${LIBRARY_DIRS}; do for name in ${LIBRARY_NAMES}; do @@ -23,9 +23,9 @@ done # figure out correct compiler flags if 
ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then - LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference" + LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip" else - LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced" + LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd" fi if [ -z "${CXX}" ]; then CXX="c++" diff --git a/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp b/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp index 6600e7291ba..e181c9a563a 100644 --- a/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp +++ b/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -83,7 +83,7 @@ constexpr double default_alpha = 10.0 / 3.0; constexpr double default_beta = -2.0 / 3.0; constexpr double default_gamma = -1.0 / 6.0; -/* Possible alternative default values are for example +/* Possible alternative default values are * default_alpha = 8.0; * default_beta = -1.0; * default_gamma = -1.0; @@ -91,18 +91,20 @@ constexpr double default_gamma = -1.0 / 6.0; // Creates a stencil matrix in CSR format for the given number of discretization // points. 
-void generate_stencil_matrix(int dp, int *row_ptrs, int *col_idxs, - double *values, double *coefs) +template +void generate_stencil_matrix(IndexType dp, IndexType *row_ptrs, + IndexType *col_idxs, ValueType *values, + ValueType *coefs) { - int pos = 0; + IndexType pos = 0; const size_t dp_2 = dp * dp; row_ptrs[0] = pos; - for (int k = 0; k < dp; ++k) { - for (int i = 0; i < dp; ++i) { + for (IndexType k = 0; k < dp; ++k) { + for (IndexType i = 0; i < dp; ++i) { const size_t index = i + k * dp; - for (int j = -1; j <= 1; ++j) { - for (int l = -1; l <= 1; ++l) { - const int64_t offset = l + 1 + 3 * (j + 1); + for (IndexType j = -1; j <= 1; ++j) { + for (IndexType l = -1; l <= 1; ++l) { + const IndexType offset = l + 1 + 3 * (j + 1); if ((k + j) >= 0 && (k + j) < dp && (i + l) >= 0 && (i + l) < dp) { values[pos] = coefs[offset]; @@ -118,15 +120,17 @@ void generate_stencil_matrix(int dp, int *row_ptrs, int *col_idxs, // Generates the RHS vector given `f` and the boundary conditions. -template -void generate_rhs(int dp, Closure f, ClosureT u, double *rhs, double *coefs) +template +void generate_rhs(IndexType dp, Closure f, ClosureT u, ValueType *rhs, + ValueType *coefs) { const size_t dp_2 = dp * dp; - const auto h = 1.0 / (dp + 1.0); - for (int i = 0; i < dp; ++i) { - const auto yi = (i + 1) * h; - for (int j = 0; j < dp; ++j) { - const auto xi = (j + 1) * h; + const ValueType h = 1.0 / (dp + 1.0); + for (IndexType i = 0; i < dp; ++i) { + const auto yi = ValueType(i + 1) * h; + for (IndexType j = 0; j < dp; ++j) { + const auto xi = ValueType(j + 1) * h; const auto index = i * dp + j; rhs[index] = -f(xi, yi) * h * h; } @@ -135,7 +139,7 @@ void generate_rhs(int dp, Closure f, ClosureT u, double *rhs, double *coefs) // Iterating over the edges to add boundary values // and adding the overlapping 3x1 to the rhs for (size_t i = 0; i < dp; ++i) { - const auto xi = (i + 1) * h; + const auto xi = ValueType(i + 1) * h; const auto index_top = i; const auto index_bot = i + dp * 
(dp - 1); @@ -148,7 +152,7 @@ void generate_rhs(int dp, Closure f, ClosureT u, double *rhs, double *coefs) rhs[index_bot] -= u(xi + h, 1.0) * coefs[8]; } for (size_t i = 0; i < dp; ++i) { - const auto yi = (i + 1) * h; + const auto yi = ValueType(i + 1) * h; const auto index_left = i * dp; const auto index_right = i * dp + (dp - 1); @@ -170,10 +174,11 @@ void generate_rhs(int dp, Closure f, ClosureT u, double *rhs, double *coefs) // Prints the solution `u`. -void print_solution(int dp, const double *u) +template +void print_solution(IndexType dp, const ValueType *u) { - for (int i = 0; i < dp; ++i) { - for (int j = 0; j < dp; ++j) { + for (IndexType i = 0; i < dp; ++i) { + for (IndexType j = 0; j < dp; ++j) { std::cout << u[i * dp + j] << ' '; } std::cout << '\n'; @@ -184,16 +189,17 @@ void print_solution(int dp, const double *u) // Computes the 1-norm of the error given the computed `u` and the correct // solution function `correct_u`. -template -double calculate_error(int dp, const double *u, Closure correct_u) +template +gko::remove_complex calculate_error(IndexType dp, const ValueType *u, + Closure correct_u) { - const auto h = 1.0 / (dp + 1); - auto error = 0.0; - for (int j = 0; j < dp; ++j) { - const auto xi = (j + 1) * h; - for (int i = 0; i < dp; ++i) { + const ValueType h = 1.0 / (dp + 1); + gko::remove_complex error = 0.0; + for (IndexType j = 0; j < dp; ++j) { + const auto xi = ValueType(j + 1) * h; + for (IndexType i = 0; i < dp; ++i) { using std::abs; - const auto yi = (i + 1) * h; + const auto yi = ValueType(i + 1) * h; error += abs(u[i * dp + j] - correct_u(xi, yi)) / abs(correct_u(xi, yi)); } @@ -202,26 +208,28 @@ double calculate_error(int dp, const double *u, Closure correct_u) } +template void solve_system(const std::string &executor_string, - unsigned int discretization_points, int *row_ptrs, - int *col_idxs, double *values, double *rhs, double *u, - double accuracy) + unsigned int discretization_points, IndexType *row_ptrs, + IndexType 
*col_idxs, ValueType *values, ValueType *rhs, + ValueType *u, gko::remove_complex reduction_factor) { // Some shortcuts - using vec = gko::matrix::Dense; - using mtx = gko::matrix::Csr; - using cg = gko::solver::Cg; - using bj = gko::preconditioner::Jacobi; - using val_array = gko::Array; - using idx_array = gko::Array; + using vec = gko::matrix::Dense; + using mtx = gko::matrix::Csr; + using cg = gko::solver::Cg; + using bj = gko::preconditioner::Jacobi; + using val_array = gko::Array; + using idx_array = gko::Array; const auto &dp = discretization_points; - const size_t dp_2 = dp * dp; + const gko::size_type dp_2 = dp * dp; // Figure out where to run the code const auto omp = gko::OmpExecutor::create(); std::map> exec_map{ {"omp", omp}, - {"cuda", gko::CudaExecutor::create(0, omp)}, + {"cuda", gko::CudaExecutor::create(0, omp, true)}, + {"hip", gko::HipExecutor::create(0, omp, true)}, {"reference", gko::ReferenceExecutor::create()}}; // executor where Ginkgo will perform the computation const auto exec = exec_map.at(executor_string); // throws if not valid @@ -263,8 +271,8 @@ void solve_system(const std::string &executor_string, cg::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(dp_2).on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(accuracy) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(reduction_factor) .on(exec)) .with_preconditioner(bj::build().on(exec)) .on(exec); @@ -283,17 +291,19 @@ int main(int argc, char *argv[]) << std::endl; std::exit(-1); } + using ValueType = double; + using IndexType = int; const int discretization_points = argc >= 2 ? std::atoi(argv[1]) : 100; const auto executor_string = argc >= 3 ? argv[2] : "reference"; - const double alpha_c = argc >= 4 ? std::atof(argv[3]) : default_alpha; - const double beta_c = argc >= 5 ? std::atof(argv[4]) : default_beta; - const double gamma_c = argc >= 6 ? 
std::atof(argv[5]) : default_gamma; + const ValueType alpha_c = argc >= 4 ? std::atof(argv[3]) : default_alpha; + const ValueType beta_c = argc >= 5 ? std::atof(argv[4]) : default_beta; + const ValueType gamma_c = argc >= 6 ? std::atof(argv[5]) : default_gamma; // clang-format off - std::array coefs{ + std::array coefs{ gamma_c, beta_c, gamma_c, - beta_c, alpha_c, beta_c, + beta_c, alpha_c, beta_c, gamma_c, beta_c, gamma_c}; // clang-format on @@ -301,38 +311,45 @@ int main(int argc, char *argv[]) const size_t dp_2 = dp * dp; // problem: - auto correct_u = [](double x, double y) { return x * x * x + y * y * y; }; - auto f = [](double x, double y) { return 6 * x + 6 * y; }; + auto correct_u = [](ValueType x, ValueType y) { + return x * x * x + y * y * y; + }; + auto f = [](ValueType x, ValueType y) { + return ValueType(6) * x + ValueType(6) * y; + }; // matrix - std::vector row_ptrs(dp_2 + 1); - std::vector col_idxs((3 * dp - 2) * (3 * dp - 2)); - std::vector values((3 * dp - 2) * (3 * dp - 2)); + std::vector row_ptrs(dp_2 + 1); + std::vector col_idxs((3 * dp - 2) * (3 * dp - 2)); + std::vector values((3 * dp - 2) * (3 * dp - 2)); // right hand side - std::vector rhs(dp_2); + std::vector rhs(dp_2); // solution - std::vector u(dp_2, 0.0); + std::vector u(dp_2, 0.0); generate_stencil_matrix(dp, row_ptrs.data(), col_idxs.data(), values.data(), coefs.data()); // looking for solution u = x^3: f = 6x, u(0) = 0, u(1) = 1 generate_rhs(dp, f, correct_u, rhs.data(), coefs.data()); - auto start_time = std::chrono::steady_clock::now(); + const gko::remove_complex reduction_factor = 1e-7; + auto start_time = std::chrono::steady_clock::now(); solve_system(executor_string, dp, row_ptrs.data(), col_idxs.data(), - values.data(), rhs.data(), u.data(), 1e-12); - + values.data(), rhs.data(), u.data(), reduction_factor); auto stop_time = std::chrono::steady_clock::now(); - double runtime_duration = - std::chrono::duration_cast(stop_time - - start_time) - .count() * + auto 
runtime_duration = + static_cast( + std::chrono::duration_cast(stop_time - + start_time) + .count()) * 1e-6; print_solution(dp, u.data()); std::cout << "The average relative error is " - << calculate_error(dp, u.data(), correct_u) / dp_2 << std::endl; + << calculate_error(dp, u.data(), correct_u) / + static_cast>(dp_2) + << std::endl; std::cout << "The runtime is " << std::to_string(runtime_duration) << " ms" << std::endl; } diff --git a/examples/papi-logging/build.sh b/examples/papi-logging/build.sh index 77a12b42db4..050fceca7ce 100755 --- a/examples/papi-logging/build.sh +++ b/examples/papi-logging/build.sh @@ -9,8 +9,8 @@ BUILD_DIR=$1 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd ) # copy libraries -LIBRARY_DIRS="core core/device_hooks reference omp cuda" -LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda" +LIBRARY_DIRS="core core/device_hooks reference omp cuda hip" +LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip" SUFFIXES=".so .dylib .dll d.so d.dylib d.dll" for prefix in ${LIBRARY_DIRS}; do for name in ${LIBRARY_NAMES}; do @@ -23,9 +23,9 @@ done # figure out correct compiler flags if ls ${THIS_DIR} | grep -F "libginkgo." 
>/dev/null; then - LINK_FLAGS="-lpapi -lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference" + LINK_FLAGS="-lpapi -lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip" else - LINK_FLAGS="-lpapi -lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced" + LINK_FLAGS="-lpapi -lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd" fi if [ -z "${CXX}" ]; then CXX="c++" diff --git a/examples/papi-logging/papi-logging.cpp b/examples/papi-logging/papi-logging.cpp index 2931eddff42..d4cfef40438 100644 --- a/examples/papi-logging/papi-logging.cpp +++ b/examples/papi-logging/papi-logging.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -126,9 +126,12 @@ void print_papi_counters(int eventset) int main(int argc, char *argv[]) { // Some shortcuts - using vec = gko::matrix::Dense<>; - using mtx = gko::matrix::Csr<>; - using cg = gko::solver::Cg<>; + using ValueType = double; + using IndexType = int; + + using vec = gko::matrix::Dense; + using mtx = gko::matrix::Csr; + using cg = gko::solver::Cg; // Print version information std::cout << gko::version_info::get() << std::endl; @@ -141,7 +144,10 @@ int main(int argc, char *argv[]) exec = gko::OmpExecutor::create(); } else if (argc == 2 && std::string(argv[1]) == "cuda" && gko::CudaExecutor::get_num_devices() > 0) { - exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); + exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true); + } else if (argc == 2 && std::string(argv[1]) == "hip" && + gko::HipExecutor::get_num_devices() > 0) { + exec = gko::HipExecutor::create(0, gko::OmpExecutor::create(), true); } else { std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl; std::exit(-1); @@ -153,12 +159,13 @@ int main(int argc, char *argv[]) auto x = 
gko::read(std::ifstream("data/x0.mtx"), exec); // Generate solver + const gko::remove_complex reduction_factor = 1e-7; auto solver_gen = cg::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(20u).on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-20) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(reduction_factor) .on(exec)) .on(exec); auto solver = solver_gen->generate(A); @@ -172,7 +179,7 @@ int main(int argc, char *argv[]) // Create a PAPI logger and add it to relevant LinOps - auto logger = gko::log::Papi<>::create( + auto logger = gko::log::Papi::create( exec, gko::log::Logger::linop_apply_completed_mask | gko::log::Logger::linop_advanced_apply_completed_mask); solver->add_logger(logger); diff --git a/examples/performance-debugging/build.sh b/examples/performance-debugging/build.sh index 35c20208d1c..70ed26bcc01 100755 --- a/examples/performance-debugging/build.sh +++ b/examples/performance-debugging/build.sh @@ -9,8 +9,8 @@ BUILD_DIR=$1 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd ) # copy libraries -LIBRARY_DIRS="core core/device_hooks reference omp cuda" -LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda" +LIBRARY_DIRS="core core/device_hooks reference omp cuda hip" +LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip" SUFFIXES=".so .dylib .dll d.so d.dylib d.dll" for prefix in ${LIBRARY_DIRS}; do for name in ${LIBRARY_NAMES}; do @@ -23,9 +23,9 @@ done # figure out correct compiler flags if ls ${THIS_DIR} | grep -F "libginkgo." 
>/dev/null; then - LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference" + LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip" else - LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced" + LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd" fi if [ -z "${CXX}" ]; then CXX="c++" diff --git a/examples/performance-debugging/doc/results.dox b/examples/performance-debugging/doc/results.dox index 57f7555981e..e6299ad72bf 100644 --- a/examples/performance-debugging/doc/results.dox +++ b/examples/performance-debugging/doc/results.dox @@ -35,7 +35,7 @@ Apply operations times (ns): dense::compute_dot#3: 28548 dense::compute_norm2#2: 45677 free: 25109 - residual_norm_reduction::residual_norm_reduction#9: 10617 + residual_norm::residual_norm#9: 10617 Recurrent Residual Norms: [ 4.3589 diff --git a/examples/performance-debugging/performance-debugging.cpp b/examples/performance-debugging/performance-debugging.cpp index 1ed7f815a49..c5a35383447 100644 --- a/examples/performance-debugging/performance-debugging.cpp +++ b/examples/performance-debugging/performance-debugging.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -70,14 +70,14 @@ std::unique_ptr> create_vector( // utilities for computing norms and residuals template -double get_norm(const vec *norm) +gko::remove_complex get_norm(const vec *norm) { - return clone(norm->get_executor()->get_master(), norm)->at(0, 0); + return std::real(clone(norm->get_executor()->get_master(), norm)->at(0, 0)); } template -double compute_norm(const vec *b) +gko::remove_complex compute_norm(const vec *b) { auto exec = b->get_executor(); auto b_norm = gko::initialize>({0.0}, exec); @@ -87,8 +87,9 @@ double compute_norm(const vec *b) template -double compute_residual_norm(const gko::LinOp *system_matrix, - const vec *b, const vec *x) +gko::remove_complex compute_residual_norm( + const gko::LinOp *system_matrix, const vec *b, + const vec *x) { auto exec = system_matrix->get_executor(); auto one = gko::initialize>({1.0}, exec); @@ -324,7 +325,8 @@ void print_usage(const char *filename) } -void print_vector(const gko::matrix::Dense<> *vec) +template +void print_vector(const gko::matrix::Dense *vec) { auto elements_to_print = std::min(gko::size_type(10), vec->get_size()[0]); std::cout << "[" << std::endl; @@ -342,23 +344,22 @@ int main(int argc, char *argv[]) { // Parametrize the benchmark here // Pick a value type - using vtype = double; + using ValueType = double; + using IndexType = int; // Pick a matrix format - using mtx = gko::matrix::Csr; + using mtx = gko::matrix::Csr; // Pick a solver - using solver = gko::solver::Cg; + using solver = gko::solver::Cg; // Pick a preconditioner type - using preconditioner = gko::matrix::IdentityFactory; + using preconditioner = gko::matrix::IdentityFactory; // Pick a residual norm reduction value - auto reduction_factor = 1e-8; - // Pick a maximum iteration count - auto max_iters = 2000u; + const gko::remove_complex reduction_factor = 1e-12; // Pick an output file name - auto of_name = "log.txt"; + const auto of_name = "log.txt"; // Simple 
shortcut - using vec = gko::matrix::Dense; + using vec = gko::matrix::Dense; // Print version information std::cout << gko::version_info::get() << std::endl; @@ -371,7 +372,7 @@ int main(int argc, char *argv[]) exec = gko::OmpExecutor::create(); } else if (argc > 1 && std::string(argv[1]) == "cuda" && gko::CudaExecutor::get_num_devices() > 0) { - exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); + exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true); } else { print_usage(argv[0]); } @@ -392,16 +393,18 @@ int main(int argc, char *argv[]) // Remove the storage logger exec->remove_logger(gko::lend(storage_logger)); + // Pick a maximum iteration count + const auto max_iters = A->get_size()[0]; // Generate b and x vectors - auto b = utils::create_vector(exec, A->get_size()[0], 1.0); - auto x = utils::create_vector(exec, A->get_size()[0], 0.0); + auto b = utils::create_vector(exec, A->get_size()[0], 1.0); + auto x = utils::create_vector(exec, A->get_size()[0], 0.0); // Declare the solver factory. The preconditioner's arguments should be // adapted if needed. 
auto solver_factory = solver::build() .with_criteria( - gko::stop::ResidualNormReduction::build() + gko::stop::ResidualNormReduction::build() .with_reduction_factor(reduction_factor) .on(exec), gko::stop::Iteration::build().with_max_iters(max_iters).on( @@ -461,9 +464,6 @@ int main(int argc, char *argv[]) // Log the internal operations using the OperationLogger without timing { - // Clone x to not overwrite the original one - auto x_clone = gko::clone(x); - // Create an OperationLogger to analyze the generate step auto gen_logger = std::make_shared(exec); // Add the generate logger to the executor @@ -480,11 +480,11 @@ int main(int argc, char *argv[]) auto apply_logger = std::make_shared(exec); exec->add_logger(apply_logger); // Create a ResidualLogger to log the recurent residual - auto res_logger = std::make_shared>( + auto res_logger = std::make_shared>( exec, gko::lend(A), gko::lend(b)); generated_solver->add_logger(res_logger); // Solve the system - generated_solver->apply(gko::lend(b), gko::lend(x_clone)); + generated_solver->apply(gko::lend(b), gko::lend(x)); exec->remove_logger(gko::lend(apply_logger)); // Write the data to the output file output_file << "Apply operations times (ns):" << std::endl; diff --git a/examples/poisson-solver/build.sh b/examples/poisson-solver/build.sh index 6a0a0c40515..09ecd8ce987 100755 --- a/examples/poisson-solver/build.sh +++ b/examples/poisson-solver/build.sh @@ -9,8 +9,8 @@ BUILD_DIR=$1 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd ) # copy libraries -LIBRARY_DIRS="core core/device_hooks reference omp cuda" -LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda" +LIBRARY_DIRS="core core/device_hooks reference omp cuda hip" +LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip" SUFFIXES=".so .dylib .dll d.so d.dylib d.dll" for prefix in ${LIBRARY_DIRS}; do for name in ${LIBRARY_NAMES}; do @@ -23,9 +23,9 @@ done # figure out correct compiler flags if ls ${THIS_DIR} | grep -F 
"libginkgo." >/dev/null; then - LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference" + LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip" else - LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced" + LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd" fi if [ -z "${CXX}" ]; then CXX="c++" diff --git a/examples/poisson-solver/poisson-solver.cpp b/examples/poisson-solver/poisson-solver.cpp index 0539b2f0ac6..6abb2a52560 100644 --- a/examples/poisson-solver/poisson-solver.cpp +++ b/examples/poisson-solver/poisson-solver.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -39,14 +39,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // Creates a stencil matrix in CSR format for the given number of discretization // points. -void generate_stencil_matrix(gko::matrix::Csr<> *matrix) +template +void generate_stencil_matrix(gko::matrix::Csr *matrix) { const auto discretization_points = matrix->get_size()[0]; auto row_ptrs = matrix->get_row_ptrs(); auto col_idxs = matrix->get_col_idxs(); auto values = matrix->get_values(); int pos = 0; - const double coefs[] = {-1, 2, -1}; + const ValueType coefs[] = {-1, 2, -1}; row_ptrs[0] = pos; for (int i = 0; i < discretization_points; ++i) { for (auto ofs : {-1, 0, 1}) { @@ -62,14 +63,15 @@ void generate_stencil_matrix(gko::matrix::Csr<> *matrix) // Generates the RHS vector given `f` and the boundary conditions. 
-template -void generate_rhs(Closure f, double u0, double u1, gko::matrix::Dense<> *rhs) +template +void generate_rhs(Closure f, ValueType u0, ValueType u1, + gko::matrix::Dense *rhs) { const auto discretization_points = rhs->get_size()[0]; auto values = rhs->get_values(); - const auto h = 1.0 / (discretization_points + 1); - for (int i = 0; i < discretization_points; ++i) { - const auto xi = (i + 1) * h; + const ValueType h = 1.0 / static_cast(discretization_points + 1); + for (gko::size_type i = 0; i < discretization_points; ++i) { + const auto xi = static_cast(i + 1) * h; values[i] = -f(xi) * h * h; } values[0] += u0; @@ -78,7 +80,9 @@ void generate_rhs(Closure f, double u0, double u1, gko::matrix::Dense<> *rhs) // Prints the solution `u`. -void print_solution(double u0, double u1, const gko::matrix::Dense<> *u) +template +void print_solution(ValueType u0, ValueType u1, + const gko::matrix::Dense *u) { std::cout << u0 << '\n'; for (int i = 0; i < u->get_size()[0]; ++i) { @@ -90,15 +94,16 @@ void print_solution(double u0, double u1, const gko::matrix::Dense<> *u) // Computes the 1-norm of the error given the computed `u` and the correct // solution function `correct_u`. 
-template -double calculate_error(int discretization_points, const gko::matrix::Dense<> *u, - Closure correct_u) +template +gko::remove_complex calculate_error( + int discretization_points, const gko::matrix::Dense *u, + Closure correct_u) { - const auto h = 1.0 / (discretization_points + 1); + const ValueType h = 1.0 / static_cast(discretization_points + 1); auto error = 0.0; for (int i = 0; i < discretization_points; ++i) { using std::abs; - const auto xi = (i + 1) * h; + const auto xi = static_cast(i + 1) * h; error += abs(u->get_const_values()[i] - correct_u(xi)) / abs(correct_u(xi)); } @@ -109,10 +114,13 @@ double calculate_error(int discretization_points, const gko::matrix::Dense<> *u, int main(int argc, char *argv[]) { // Some shortcuts - using vec = gko::matrix::Dense; - using mtx = gko::matrix::Csr; - using cg = gko::solver::Cg; - using bj = gko::preconditioner::Jacobi<>; + using ValueType = double; + using IndexType = int; + + using vec = gko::matrix::Dense; + using mtx = gko::matrix::Csr; + using cg = gko::solver::Cg; + using bj = gko::preconditioner::Jacobi; if (argc < 2) { std::cerr << "Usage: " << argv[0] << " DISCRETIZATION_POINTS [executor]" @@ -129,7 +137,8 @@ int main(int argc, char *argv[]) const auto omp = gko::OmpExecutor::create(); std::map> exec_map{ {"omp", omp}, - {"cuda", gko::CudaExecutor::create(0, omp)}, + {"cuda", gko::CudaExecutor::create(0, omp, true)}, + {"hip", gko::HipExecutor::create(0, omp, true)}, {"reference", gko::ReferenceExecutor::create()}}; // executor where Ginkgo will perform the computation @@ -138,8 +147,8 @@ int main(int argc, char *argv[]) const auto app_exec = exec_map["omp"]; // problem: - auto correct_u = [](double x) { return x * x * x; }; - auto f = [](double x) { return 6 * x; }; + auto correct_u = [](ValueType x) { return x * x * x; }; + auto f = [](ValueType x) { return ValueType(6) * x; }; auto u0 = correct_u(0); auto u1 = correct_u(1); @@ -154,22 +163,24 @@ int main(int argc, char *argv[]) 
u->get_values()[i] = 0.0; } + const gko::remove_complex reduction_factor = 1e-7; // Generate solver and solve the system cg::build() .with_criteria(gko::stop::Iteration::build() .with_max_iters(discretization_points) .on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-6) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(reduction_factor) .on(exec)) .with_preconditioner(bj::build().on(exec)) .on(exec) ->generate(clone(exec, matrix)) // copy the matrix to the executor ->apply(lend(rhs), lend(u)); - print_solution(u0, u1, lend(u)); + print_solution(u0, u1, lend(u)); std::cout << "The average relative error is " << calculate_error(discretization_points, lend(u), correct_u) / - discretization_points + static_cast>( + discretization_points) << std::endl; } diff --git a/examples/preconditioned-solver/build.sh b/examples/preconditioned-solver/build.sh index efbbb19aadc..5022e339ae5 100755 --- a/examples/preconditioned-solver/build.sh +++ b/examples/preconditioned-solver/build.sh @@ -9,8 +9,8 @@ BUILD_DIR=$1 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd ) # copy libraries -LIBRARY_DIRS="core core/device_hooks reference omp cuda" -LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda" +LIBRARY_DIRS="core core/device_hooks reference omp cuda hip" +LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip" SUFFIXES=".so .dylib .dll d.so d.dylib d.dll" for prefix in ${LIBRARY_DIRS}; do for name in ${LIBRARY_NAMES}; do @@ -23,9 +23,9 @@ done # figure out correct compiler flags if ls ${THIS_DIR} | grep -F "libginkgo." 
>/dev/null; then - LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference" + LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip" else - LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced" + LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd" fi if [ -z "${CXX}" ]; then CXX="c++" diff --git a/examples/preconditioned-solver/preconditioned-solver.cpp b/examples/preconditioned-solver/preconditioned-solver.cpp index db771e94af8..531f803fd80 100644 --- a/examples/preconditioned-solver/preconditioned-solver.cpp +++ b/examples/preconditioned-solver/preconditioned-solver.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -42,10 +42,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
int main(int argc, char *argv[]) { // Some shortcuts - using vec = gko::matrix::Dense<>; - using mtx = gko::matrix::Csr<>; - using cg = gko::solver::Cg<>; - using bj = gko::preconditioner::Jacobi<>; + using ValueType = double; + using IndexType = int; + + using vec = gko::matrix::Dense; + using mtx = gko::matrix::Csr; + using cg = gko::solver::Cg; + using bj = gko::preconditioner::Jacobi; // Print version information std::cout << gko::version_info::get() << std::endl; @@ -58,7 +61,10 @@ int main(int argc, char *argv[]) exec = gko::OmpExecutor::create(); } else if (argc == 2 && std::string(argv[1]) == "cuda" && gko::CudaExecutor::get_num_devices() > 0) { - exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); + exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true); + } else if (argc == 2 && std::string(argv[1]) == "hip" && + gko::HipExecutor::get_num_devices() > 0) { + exec = gko::HipExecutor::create(0, gko::OmpExecutor::create(), true); } else { std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl; std::exit(-1); @@ -69,13 +75,14 @@ int main(int argc, char *argv[]) auto b = gko::read(std::ifstream("data/b.mtx"), exec); auto x = gko::read(std::ifstream("data/x0.mtx"), exec); + const gko::remove_complex reduction_factor = 1e-7; // Create solver factory auto solver_gen = cg::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(20u).on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-20) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(reduction_factor) .on(exec)) // Add preconditioner, these 2 lines are the only // difference from the simple solver example diff --git a/examples/simple-solver-logging/build.sh b/examples/simple-solver-logging/build.sh index 5062f0faf03..0a664512469 100755 --- a/examples/simple-solver-logging/build.sh +++ b/examples/simple-solver-logging/build.sh @@ -9,8 +9,8 @@ BUILD_DIR=$1 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" 
&>/dev/null && pwd ) # copy libraries -LIBRARY_DIRS="core core/device_hooks reference omp cuda" -LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda" +LIBRARY_DIRS="core core/device_hooks reference omp cuda hip" +LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip" SUFFIXES=".so .dylib .dll d.so d.dylib d.dll" for prefix in ${LIBRARY_DIRS}; do for name in ${LIBRARY_NAMES}; do @@ -23,9 +23,9 @@ done # figure out correct compiler flags if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then - LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference" + LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip" else - LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced" + LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd" fi if [ -z "${CXX}" ]; then CXX="c++" diff --git a/examples/simple-solver-logging/doc/results.dox b/examples/simple-solver-logging/doc/results.dox index dbb01d1a63c..98dffea7d6a 100644 --- a/examples/simple-solver-logging/doc/results.dox +++ b/examples/simple-solver-logging/doc/results.dox @@ -49,9 +49,9 @@ gko::ReferenceExecutor,0x55ae09d8f2a0] [LOG] >>> check started for stop::Criterion[gko::stop::ResidualNormReduction,0x55ae09d99260] at iteration 0 with ID 1 and finalized set to 1 [LOG] >>> Operation[gko::matrix::dense::compute_norm2_operation const*, gko::matrix::Dense*>,0x7ffcab765740] started on Executor[gko::ReferenceExecutor,0x55ae09d8f2a0] [LOG] >>> Operation[gko::matrix::dense::compute_norm2_operation const*, gko::matrix::Dense*>,0x7ffcab765740] completed on Executor[gko::ReferenceExecutor,0x55ae09d8f2a0] -[LOG] >>> Operation[gko::stop::residual_norm_reduction::residual_norm_reduction_operation const*&, gko::matrix::Dense*, double&, unsigned char&, bool&, gko::Array*&, gko::Array*, bool*, bool*&>,0x7ffcab765980] +[LOG] >>> Operation[gko::stop::residual_norm::residual_norm_operation const*&, gko::matrix::Dense*, double&, 
unsigned char&, bool&, gko::Array*&, gko::Array*, bool*, bool*&>,0x7ffcab765980] started on Executor[gko::ReferenceExecutor,0x55ae09d8f2a0] -[LOG] >>> Operation[gko::stop::residual_norm_reduction::residual_norm_reduction_operation const*&, gko::matrix::Dense*, double&, unsigned char&, bool&, gko::Array*&, gko::Array*, bool*, bool*&>,0x7ffcab765980] +[LOG] >>> Operation[gko::stop::residual_norm::residual_norm_operation const*&, gko::matrix::Dense*, double&, unsigned char&, bool&, gko::Array*&, gko::Array*, bool*, bool*&>,0x7ffcab765980] completed on Executor[gko::ReferenceExecutor,0x55ae09d8f2a0] [LOG] >>> check completed for stop::Criterion[gko::stop::ResidualNormReduction,0x55ae09d99260] at iteration 0 with ID 1 and finalized set to 1. It changed one RHS 0, stopped the iteration process 0 [LOG] >>> allocation started on Executor[gko::ReferenceExecutor,0x55ae09d8f2a0] with Bytes[152] diff --git a/examples/simple-solver-logging/simple-solver-logging.cpp b/examples/simple-solver-logging/simple-solver-logging.cpp index 2fccdb65de9..3e48bcdacda 100644 --- a/examples/simple-solver-logging/simple-solver-logging.cpp +++ b/examples/simple-solver-logging/simple-solver-logging.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -42,7 +42,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
namespace { -void print_vector(const std::string &name, const gko::matrix::Dense<> *vec) +template +void print_vector(const std::string &name, + const gko::matrix::Dense *vec) { std::cout << name << " = [" << std::endl; for (int i = 0; i < vec->get_size()[0]; ++i) { @@ -58,9 +60,12 @@ void print_vector(const std::string &name, const gko::matrix::Dense<> *vec) int main(int argc, char *argv[]) { // Some shortcuts - using vec = gko::matrix::Dense<>; - using mtx = gko::matrix::Csr<>; - using cg = gko::solver::Cg<>; + using ValueType = double; + using IndexType = int; + + using vec = gko::matrix::Dense; + using mtx = gko::matrix::Csr; + using cg = gko::solver::Cg; // Print version information std::cout << gko::version_info::get() << std::endl; @@ -73,7 +78,10 @@ int main(int argc, char *argv[]) exec = gko::OmpExecutor::create(); } else if (argc == 2 && std::string(argv[1]) == "cuda" && gko::CudaExecutor::get_num_devices() > 0) { - exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); + exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true); + } else if (argc == 2 && std::string(argv[1]) == "hip" && + gko::HipExecutor::get_num_devices() > 0) { + exec = gko::HipExecutor::create(0, gko::OmpExecutor::create(), true); } else { std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl; std::exit(-1); @@ -88,8 +96,8 @@ int main(int argc, char *argv[]) // file. We log all events except for all linop factory and polymorphic // object events. Events masks are group of events which are provided // for convenience. 
- std::shared_ptr> stream_logger = - gko::log::Stream<>::create( + std::shared_ptr> stream_logger = + gko::log::Stream::create( exec, gko::log::Logger::all_events_mask ^ gko::log::Logger::linop_factory_events_mask ^ @@ -102,11 +110,13 @@ int main(int argc, char *argv[]) // Add stream_logger only to the ResidualNormReduction criterion Factory // Note that the logger will get automatically propagated to every criterion // generated from this factory. + const gko::remove_complex reduction_factor = 1e-7; using ResidualCriterionFactory = - gko::stop::ResidualNormReduction<>::Factory; + gko::stop::ResidualNormReduction::Factory; std::shared_ptr residual_criterion = - ResidualCriterionFactory::create().with_reduction_factor(1e-20).on( - exec); + ResidualCriterionFactory::create() + .with_reduction_factor(reduction_factor) + .on(exec); residual_criterion->add_logger(stream_logger); // Generate solver @@ -124,7 +134,7 @@ int main(int argc, char *argv[]) // gko::log::Logger::iteration_complete_mask. See the documentation of // Logger class for more information. 
std::ofstream filestream("my_file.txt"); - solver->add_logger(gko::log::Stream<>::create( + solver->add_logger(gko::log::Stream::create( exec, gko::log::Logger::all_events_mask, filestream)); solver->add_logger(stream_logger); @@ -153,7 +163,7 @@ int main(int argc, char *argv[]) // convergence happened) auto residual = record_logger->get().criterion_check_completed.back()->residual.get(); - auto residual_d = gko::as>(residual); + auto residual_d = gko::as(residual); print_vector("Residual", residual_d); // Print solution diff --git a/examples/simple-solver/build.sh b/examples/simple-solver/build.sh index dd4dd0fd710..f2c94bc239c 100755 --- a/examples/simple-solver/build.sh +++ b/examples/simple-solver/build.sh @@ -9,8 +9,8 @@ BUILD_DIR=$1 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd ) # copy libraries -LIBRARY_DIRS="core core/device_hooks reference omp cuda" -LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda" +LIBRARY_DIRS="core core/device_hooks reference omp cuda hip" +LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip" SUFFIXES=".so .dylib .dll d.so d.dylib d.dll" for prefix in ${LIBRARY_DIRS}; do for name in ${LIBRARY_NAMES}; do @@ -23,9 +23,9 @@ done # figure out correct compiler flags if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then - LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference" + LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip" else - LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced" + LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd" fi if [ -z "${CXX}" ]; then CXX="c++" diff --git a/examples/simple-solver/doc/intro.dox b/examples/simple-solver/doc/intro.dox index 2869e91a12c..70bc1ce3cc7 100644 --- a/examples/simple-solver/doc/intro.dox +++ b/examples/simple-solver/doc/intro.dox @@ -7,13 +7,13 @@ change the parameters and see what is best suited for your purposes.

    About the example

    Each example has the following sections:
      -
    1. Introduction:This gives an overview of the example and mentions - any interesting aspects in the example that might help the reader. -
    2. The commented program: This section is intended for you to - understand the details of the example so that you can play with it and understand - Ginkgo and its features better. -
    3. Results: This section shows the results of the code when run. Though the - results may not be completely the same, you can expect the behaviour to be similar. -
    4. The plain program: This is the complete code without any comments to have - an complete overview of the code. -
    +
  • Introduction: This gives an overview of the example and mentions + any interesting aspects in the example that might help the reader. +
  • The commented program: This section is intended for you to + understand the details of the example so that you can play with it and understand + Ginkgo and its features better. +
  • Results: This section shows the results of the code when run. Though the + results may not be completely the same, you can expect the behaviour to be similar. +
  • The plain program: This is the complete code without any comments to have + an complete overview of the code. + diff --git a/examples/simple-solver/simple-solver.cpp b/examples/simple-solver/simple-solver.cpp index 0026e2bcacc..ad1f43b69f2 100644 --- a/examples/simple-solver/simple-solver.cpp +++ b/examples/simple-solver/simple-solver.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -49,14 +49,16 @@ int main(int argc, char *argv[]) // with one column/one row. The advantage of this concept is that using // multiple vectors is a now a natural extension of adding columns/rows are // necessary. - using vec = gko::matrix::Dense<>; + using ValueType = double; + using IndexType = int; + using vec = gko::matrix::Dense; // The gko::matrix::Csr class is used here, but any other matrix class such // as gko::matrix::Coo, gko::matrix::Hybrid, gko::matrix::Ell or // gko::matrix::Sellp could also be used. - using mtx = gko::matrix::Csr<>; + using mtx = gko::matrix::Csr; // The gko::solver::Cg is used here, but any other solver class can also be // used. - using cg = gko::solver::Cg<>; + using cg = gko::solver::Cg; // Print the ginkgo version information. 
std::cout << gko::version_info::get() << std::endl; @@ -78,7 +80,10 @@ int main(int argc, char *argv[]) exec = gko::OmpExecutor::create(); } else if (argc == 2 && std::string(argv[1]) == "cuda" && gko::CudaExecutor::get_num_devices() > 0) { - exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); + exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true); + } else if (argc == 2 && std::string(argv[1]) == "hip" && + gko::HipExecutor::get_num_devices() > 0) { + exec = gko::HipExecutor::create(0, gko::OmpExecutor::create(), true); } else { std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl; std::exit(-1); @@ -105,12 +110,13 @@ int main(int argc, char *argv[]) // criteria(gko::stop) are also generated from factories using their build // methods. You need to specify the executors which each of the object needs // to be built on. + const gko::remove_complex reduction_factor = 1e-7; auto solver_gen = cg::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(20u).on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-15) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(reduction_factor) .on(exec)) .on(exec); // Generate the solver from the matrix. 
The solver factory built in the diff --git a/examples/three-pt-stencil-solver/build.sh b/examples/three-pt-stencil-solver/build.sh index 3594d40eda1..882e9c22bdf 100755 --- a/examples/three-pt-stencil-solver/build.sh +++ b/examples/three-pt-stencil-solver/build.sh @@ -9,8 +9,8 @@ BUILD_DIR=$1 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd ) # copy libraries -LIBRARY_DIRS="core core/device_hooks reference omp cuda" -LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda" +LIBRARY_DIRS="core core/device_hooks reference omp cuda hip" +LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip" SUFFIXES=".so .dylib .dll d.so d.dylib d.dll" for prefix in ${LIBRARY_DIRS}; do for name in ${LIBRARY_NAMES}; do @@ -23,9 +23,9 @@ done # figure out correct compiler flags if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then - LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference" + LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip" else - LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced" + LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd" fi if [ -z "${CXX}" ]; then CXX="c++" diff --git a/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp b/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp index 339f4239519..504278d0ed8 100644 --- a/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp +++ b/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -78,13 +78,15 @@ use Ginkgo, and the only part where Ginkgo is introduced is inside the // Creates a stencil matrix in CSR format for the given number of discretization // points. 
-void generate_stencil_matrix(int discretization_points, int *row_ptrs, - int *col_idxs, double *values) +template +void generate_stencil_matrix(IndexType discretization_points, + IndexType *row_ptrs, IndexType *col_idxs, + ValueType *values) { - int pos = 0; - const double coefs[] = {-1, 2, -1}; + IndexType pos = 0; + const ValueType coefs[] = {-1, 2, -1}; row_ptrs[0] = pos; - for (int i = 0; i < discretization_points; ++i) { + for (IndexType i = 0; i < discretization_points; ++i) { for (auto ofs : {-1, 0, 1}) { if (0 <= i + ofs && i + ofs < discretization_points) { values[pos] = coefs[ofs + 1]; @@ -98,13 +100,13 @@ void generate_stencil_matrix(int discretization_points, int *row_ptrs, // Generates the RHS vector given `f` and the boundary conditions. -template -void generate_rhs(int discretization_points, Closure f, double u0, double u1, - double *rhs) +template +void generate_rhs(IndexType discretization_points, Closure f, ValueType u0, + ValueType u1, ValueType *rhs) { - const auto h = 1.0 / (discretization_points + 1); - for (int i = 0; i < discretization_points; ++i) { - const auto xi = (i + 1) * h; + const ValueType h = 1.0 / (discretization_points + 1); + for (IndexType i = 0; i < discretization_points; ++i) { + const ValueType xi = ValueType(i + 1) * h; rhs[i] = -f(xi) * h * h; } rhs[0] += u0; @@ -113,11 +115,12 @@ void generate_rhs(int discretization_points, Closure f, double u0, double u1, // Prints the solution `u`. 
-void print_solution(int discretization_points, double u0, double u1, - const double *u) +template +void print_solution(IndexType discretization_points, ValueType u0, ValueType u1, + const ValueType *u) { std::cout << u0 << '\n'; - for (int i = 0; i < discretization_points; ++i) { + for (IndexType i = 0; i < discretization_points; ++i) { std::cout << u[i] << '\n'; } std::cout << u1 << std::endl; @@ -126,40 +129,42 @@ void print_solution(int discretization_points, double u0, double u1, // Computes the 1-norm of the error given the computed `u` and the correct // solution function `correct_u`. -template -double calculate_error(int discretization_points, const double *u, - Closure correct_u) +template +gko::remove_complex calculate_error(IndexType discretization_points, + const ValueType *u, + Closure correct_u) { - const auto h = 1.0 / (discretization_points + 1); - auto error = 0.0; - for (int i = 0; i < discretization_points; ++i) { + const ValueType h = 1.0 / (discretization_points + 1); + gko::remove_complex error = 0.0; + for (IndexType i = 0; i < discretization_points; ++i) { using std::abs; - const auto xi = (i + 1) * h; + const ValueType xi = ValueType(i + 1) * h; error += abs(u[i] - correct_u(xi)) / abs(correct_u(xi)); } return error; } - +template void solve_system(const std::string &executor_string, - unsigned int discretization_points, int *row_ptrs, - int *col_idxs, double *values, double *rhs, double *u, - double accuracy) + IndexType discretization_points, IndexType *row_ptrs, + IndexType *col_idxs, ValueType *values, ValueType *rhs, + ValueType *u, gko::remove_complex reduction_factor) { // Some shortcuts - using vec = gko::matrix::Dense; - using mtx = gko::matrix::Csr; - using cg = gko::solver::Cg; - using bj = gko::preconditioner::Jacobi; - using val_array = gko::Array; - using idx_array = gko::Array; + using vec = gko::matrix::Dense; + using mtx = gko::matrix::Csr; + using cg = gko::solver::Cg; + using bj = gko::preconditioner::Jacobi; + using 
val_array = gko::Array; + using idx_array = gko::Array; const auto &dp = discretization_points; // Figure out where to run the code const auto omp = gko::OmpExecutor::create(); std::map> exec_map{ {"omp", omp}, - {"cuda", gko::CudaExecutor::create(0, omp)}, + {"cuda", gko::CudaExecutor::create(0, omp, true)}, + {"hip", gko::HipExecutor::create(0, omp, true)}, {"reference", gko::ReferenceExecutor::create()}}; // executor where Ginkgo will perform the computation const auto exec = exec_map.at(executor_string); // throws if not valid @@ -198,11 +203,12 @@ void solve_system(const std::string &executor_string, // Generate solver auto solver_gen = cg::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(dp).on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(accuracy) - .on(exec)) + .with_criteria(gko::stop::Iteration::build() + .with_max_iters(gko::size_type(dp)) + .on(exec), + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(reduction_factor) + .on(exec)) .with_preconditioner(bj::build().on(exec)) .on(exec); auto solver = solver_gen->generate(gko::give(matrix)); @@ -214,29 +220,34 @@ void solve_system(const std::string &executor_string, int main(int argc, char *argv[]) { + using ValueType = double; + using IndexType = int; + if (argc < 2) { std::cerr << "Usage: " << argv[0] << " DISCRETIZATION_POINTS [executor]" << std::endl; std::exit(-1); } - const int discretization_points = argc >= 2 ? std::atoi(argv[1]) : 100; + const IndexType discretization_points = + argc >= 2 ? std::atoi(argv[1]) : 100; const auto executor_string = argc >= 3 ? 
argv[2] : "reference"; // problem: - auto correct_u = [](double x) { return x * x * x; }; - auto f = [](double x) { return 6 * x; }; + auto correct_u = [](ValueType x) { return x * x * x; }; + auto f = [](ValueType x) { return ValueType(6) * x; }; auto u0 = correct_u(0); auto u1 = correct_u(1); // matrix - std::vector row_ptrs(discretization_points + 1); - std::vector col_idxs(3 * discretization_points - 2); - std::vector values(3 * discretization_points - 2); + std::vector row_ptrs(discretization_points + 1); + std::vector col_idxs(3 * discretization_points - 2); + std::vector values(3 * discretization_points - 2); // right hand side - std::vector rhs(discretization_points); + std::vector rhs(discretization_points); // solution - std::vector u(discretization_points, 0.0); + std::vector u(discretization_points, 0.0); + const gko::remove_complex reduction_factor = 1e-7; generate_stencil_matrix(discretization_points, row_ptrs.data(), col_idxs.data(), values.data()); @@ -244,9 +255,10 @@ int main(int argc, char *argv[]) generate_rhs(discretization_points, f, u0, u1, rhs.data()); solve_system(executor_string, discretization_points, row_ptrs.data(), - col_idxs.data(), values.data(), rhs.data(), u.data(), 1e-12); + col_idxs.data(), values.data(), rhs.data(), u.data(), + reduction_factor); - print_solution(discretization_points, 0, 1, u.data()); + print_solution(discretization_points, 0, 1, u.data()); std::cout << "The average relative error is " << calculate_error(discretization_points, u.data(), correct_u) / discretization_points diff --git a/examples/twentyseven-pt-stencil-solver/build.sh b/examples/twentyseven-pt-stencil-solver/build.sh old mode 100644 new mode 100755 index d38c973164b..f4d33aa2d37 --- a/examples/twentyseven-pt-stencil-solver/build.sh +++ b/examples/twentyseven-pt-stencil-solver/build.sh @@ -9,8 +9,8 @@ BUILD_DIR=$1 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd ) # copy libraries -LIBRARY_DIRS="core core/device_hooks reference 
omp cuda" -LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda" +LIBRARY_DIRS="core core/device_hooks reference omp cuda hip" +LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip" SUFFIXES=".so .dylib .dll d.so d.dylib d.dll" for prefix in ${LIBRARY_DIRS}; do for name in ${LIBRARY_NAMES}; do @@ -23,9 +23,9 @@ done # figure out correct compiler flags if ls ${THIS_DIR} | grep -F "libginkgo." >/dev/null; then - LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference" + LINK_FLAGS="-lginkgo -lginkgo_omp -lginkgo_cuda -lginkgo_reference -lginkgo_hip" else - LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced" + LINK_FLAGS="-lginkgod -lginkgo_ompd -lginkgo_cudad -lginkgo_referenced -lginkgo_hipd" fi if [ -z "${CXX}" ]; then CXX="c++" diff --git a/examples/twentyseven-pt-stencil-solver/twentyseven-pt-stencil-solver.cpp b/examples/twentyseven-pt-stencil-solver/twentyseven-pt-stencil-solver.cpp index f319ed35513..75fd314ebdf 100644 --- a/examples/twentyseven-pt-stencil-solver/twentyseven-pt-stencil-solver.cpp +++ b/examples/twentyseven-pt-stencil-solver/twentyseven-pt-stencil-solver.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -93,21 +93,20 @@ constexpr double default_delta = -1.0 / 24.0; // Creates a stencil matrix in CSR format for the given number of discretization // points. 
-void generate_stencil_matrix(int dp, int *row_ptrs, int *col_idxs, - double *values, double *coefs) +template +void generate_stencil_matrix(IndexType dp, IndexType *row_ptrs, + IndexType *col_idxs, ValueType *values, + ValueType *coefs) { - int pos = 0; - size_t dp_2 = dp * dp; - - + IndexType pos = 0; row_ptrs[0] = pos; for (int64_t z = 0; z < dp; ++z) { for (int64_t y = 0; y < dp; ++y) { for (int64_t x = 0; x < dp; ++x) { const auto index = x + dp * (y + dp * z); - for (int k = -1; k <= 1; ++k) { - for (int j = -1; j <= 1; ++j) { - for (int i = -1; i <= 1; ++i) { + for (IndexType k = -1; k <= 1; ++k) { + for (IndexType j = -1; j <= 1; ++j) { + for (IndexType i = -1; i <= 1; ++i) { const int64_t offset = i + 1 + 3 * (j + 1 + 3 * (k + 1)); if ((x + i) >= 0 && (x + i) < dp && (y + j) >= 0 && @@ -127,17 +126,18 @@ void generate_stencil_matrix(int dp, int *row_ptrs, int *col_idxs, // Generates the RHS vector given `f` and the boundary conditions. -template -void generate_rhs(int dp, Closure f, ClosureT u, double *rhs, double *coefs) +template +void generate_rhs(IndexType dp, Closure f, ClosureT u, ValueType *rhs, + ValueType *coefs) { - const size_t dp_2 = dp * dp; - const auto h = 1.0 / (dp + 1.0); + const ValueType h = 1.0 / (dp + 1.0); for (size_t k = 0; k < dp; ++k) { - const auto zi = (k + 1) * h; + const auto zi = ValueType(k + 1) * h; for (size_t j = 0; j < dp; ++j) { - const auto yi = (j + 1) * h; + const auto yi = ValueType(j + 1) * h; for (size_t i = 0; i < dp; ++i) { - const auto xi = (i + 1) * h; + const auto xi = ValueType(i + 1) * h; const auto index = i + dp * (j + dp * k); rhs[index] = -f(xi, yi, zi) * h * h; } @@ -150,17 +150,18 @@ void generate_rhs(int dp, Closure f, ClosureT u, double *rhs, double *coefs) // z - ortho to front, back for (size_t j = 0; j < dp; ++j) { for (size_t k = 0; k < dp; ++k) { - const auto yi = (j + 1) * h; - const auto zi = (k + 1) * h; + const auto yi = ValueType(j + 1) * h; + const auto zi = ValueType(k + 1) * h; const 
auto index_left = dp * j + dp * dp * k; const auto index_right = dp * j + dp * dp * k + (dp - 1); - for (int b = -1; b <= 1; ++b) { - for (int c = -1; c <= 1; ++c) { - rhs[index_left] -= u(0.0, yi + b * h, zi + c * h) * - coefs[3 * (b + 1) + 3 * 3 * (c + 1)]; + for (IndexType b = -1; b <= 1; ++b) { + for (IndexType c = -1; c <= 1; ++c) { + rhs[index_left] -= + u(0.0, yi + ValueType(b) * h, zi + ValueType(c) * h) * + coefs[3 * (b + 1) + 3 * 3 * (c + 1)]; rhs[index_right] -= - u(1.0, yi + b * h, zi + c * h) * + u(1.0, yi + ValueType(b) * h, zi + ValueType(c) * h) * coefs[3 * (b + 1) + 3 * 3 * (c + 1) + 2]; } } @@ -171,18 +172,20 @@ void generate_rhs(int dp, Closure f, ClosureT u, double *rhs, double *coefs) // included this case for (size_t i = 0; i < dp; ++i) { for (size_t k = 0; k < dp; ++k) { - const auto xi = (i + 1) * h; - const auto zi = (k + 1) * h; + const auto xi = ValueType(i + 1) * h; + const auto zi = ValueType(k + 1) * h; const auto index_top = i + dp * dp * k; const auto index_bot = i + dp * dp * k + dp * (dp - 1); - for (int a = -1; a <= 1; ++a) { + for (IndexType a = -1; a <= 1; ++a) { if ((i < (dp - 1) || a < 1) && (i > 0 || a > -1)) { - for (int c = -1; c <= 1; ++c) { - rhs[index_top] -= u(xi + a * h, 0.0, zi + c * h) * + for (IndexType c = -1; c <= 1; ++c) { + rhs[index_top] -= u(xi + ValueType(a) * h, 0.0, + zi + ValueType(c) * h) * coefs[(a + 1) + 3 * 3 * (c + 1)]; rhs[index_bot] -= - u(xi + a * h, 1.0, zi + c * h) * + u(xi + ValueType(a) * h, 1.0, + zi + ValueType(c) * h) * coefs[(a + 1) + 3 * 3 * (c + 1) + 3 * 2]; } } @@ -193,19 +196,21 @@ void generate_rhs(int dp, Closure f, ClosureT u, double *rhs, double *coefs) // Now every side has to be checked for (size_t i = 0; i < dp; ++i) { for (size_t j = 0; j < dp; ++j) { - const auto xi = (i + 1) * h; - const auto yi = (j + 1) * h; + const auto xi = ValueType(i + 1) * h; + const auto yi = ValueType(j + 1) * h; const auto index_front = i + dp * j; const auto index_back = i + dp * j + dp * dp * (dp - 
1); - for (int a = -1; a <= 1; ++a) { + for (IndexType a = -1; a <= 1; ++a) { if ((i < (dp - 1) || a < 1) && (i > 0 || a > -1)) { - for (int b = -1; b <= 1; ++b) { + for (IndexType b = -1; b <= 1; ++b) { if ((j < (dp - 1) || b < 1) && (j > 0 || j > -1)) { - rhs[index_front] -= u(xi + a * h, yi + b * h, 0.0) * + rhs[index_front] -= u(xi + ValueType(a) * h, + yi + ValueType(b) * h, 0.0) * coefs[(a + 1) + 3 * (b + 1)]; rhs[index_back] -= - u(xi + a * h, yi + b * h, 1.0) * + u(xi + ValueType(a) * h, yi + ValueType(b) * h, + 1.0) * coefs[(a + 1) + 3 * (b + 1) + 3 * 3 * 2]; } } @@ -217,7 +222,8 @@ void generate_rhs(int dp, Closure f, ClosureT u, double *rhs, double *coefs) // Prints the solution `u`. -void print_solution(int dp, const double *u) +template +void print_solution(IndexType dp, const ValueType *u) { for (size_t k = 0; k < dp; ++k) { for (size_t j = 0; j < dp; ++j) { @@ -234,18 +240,19 @@ void print_solution(int dp, const double *u) // Computes the 1-norm of the error given the computed `u` and the correct // solution function `correct_u`. 
-template -double calculate_error(int dp, const double *u, Closure correct_u) +template +gko::remove_complex calculate_error(IndexType dp, const ValueType *u, + Closure correct_u) { using std::abs; const auto h = 1.0 / (dp + 1); - auto error = 0.0; - for (int k = 0; k < dp; ++k) { - const auto zi = (k + 1) * h; - for (int j = 0; j < dp; ++j) { - const auto yi = (j + 1) * h; - for (int i = 0; i < dp; ++i) { - const auto xi = (i + 1) * h; + gko::remove_complex error = 0.0; + for (IndexType k = 0; k < dp; ++k) { + const auto zi = ValueType(k + 1) * h; + for (IndexType j = 0; j < dp; ++j) { + const auto yi = ValueType(j + 1) * h; + for (IndexType i = 0; i < dp; ++i) { + const auto xi = ValueType(i + 1) * h; error += abs(u[k * dp * dp + i * dp + j] - correct_u(xi, yi, zi)) / abs(correct_u(xi, yi, zi)); @@ -256,27 +263,28 @@ double calculate_error(int dp, const double *u, Closure correct_u) } +template void solve_system(const std::string &executor_string, - unsigned int discretization_points, int *row_ptrs, - int *col_idxs, double *values, double *rhs, double *u, - double accuracy) + IndexType discretization_points, IndexType *row_ptrs, + IndexType *col_idxs, ValueType *values, ValueType *rhs, + ValueType *u, gko::remove_complex reduction_factor) { // Some shortcuts - using vec = gko::matrix::Dense; - using mtx = gko::matrix::Csr; - using cg = gko::solver::Cg; - using bj = gko::preconditioner::Jacobi; - using val_array = gko::Array; - using idx_array = gko::Array; + using vec = gko::matrix::Dense; + using mtx = gko::matrix::Csr; + using cg = gko::solver::Cg; + using bj = gko::preconditioner::Jacobi; + using val_array = gko::Array; + using idx_array = gko::Array; const auto &dp = discretization_points; - const size_t dp_2 = dp * dp; const size_t dp_3 = dp * dp * dp; // Figure out where to run the code const auto omp = gko::OmpExecutor::create(); std::map> exec_map{ {"omp", omp}, - {"cuda", gko::CudaExecutor::create(0, omp)}, + {"cuda", gko::CudaExecutor::create(0, omp, 
true)}, + {"hip", gko::HipExecutor::create(0, omp, true)}, {"reference", gko::ReferenceExecutor::create()}}; // executor where Ginkgo will perform the computation const auto exec = exec_map.at(executor_string); // throws if not valid @@ -320,8 +328,8 @@ void solve_system(const std::string &executor_string, cg::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(dp_3).on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(accuracy) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(reduction_factor) .on(exec)) .with_preconditioner(bj::build().on(exec)) .on(exec); @@ -333,6 +341,8 @@ void solve_system(const std::string &executor_string, int main(int argc, char *argv[]) { + using ValueType = double; + using IndexType = int; if (argc < 2) { std::cerr << "Usage: " << argv[0] << " DISCRETIZATION_POINTS [executor]" @@ -341,15 +351,16 @@ int main(int argc, char *argv[]) std::exit(-1); } - const int discretization_points = argc >= 2 ? std::atoi(argv[1]) : 100; + const IndexType discretization_points = + argc >= 2 ? std::atoi(argv[1]) : 100; const auto executor_string = argc >= 3 ? argv[2] : "reference"; - const double alpha_c = argc >= 4 ? std::atof(argv[3]) : default_alpha; - const double beta_c = argc >= 5 ? std::atof(argv[4]) : default_beta; - const double gamma_c = argc >= 6 ? std::atof(argv[5]) : default_gamma; - const double delta_c = argc >= 7 ? std::atof(argv[6]) : default_delta; + const ValueType alpha_c = argc >= 4 ? std::atof(argv[3]) : default_alpha; + const ValueType beta_c = argc >= 5 ? std::atof(argv[4]) : default_beta; + const ValueType gamma_c = argc >= 6 ? std::atof(argv[5]) : default_gamma; + const ValueType delta_c = argc >= 7 ? 
std::atof(argv[6]) : default_delta; // clang-format off - std::array coefs{ + std::array coefs{ delta_c, gamma_c, delta_c, gamma_c, beta_c, gamma_c, delta_c, gamma_c, delta_c, @@ -369,40 +380,47 @@ int main(int argc, char *argv[]) const size_t dp_3 = dp * dp * dp; // problem: - auto correct_u = [](double x, double y, double z) { + auto correct_u = [](ValueType x, ValueType y, ValueType z) { return x * x * x + y * y * y + z * z * z; }; - auto f = [](double x, double y, double z) { return 6 * x + 6 * y + 6 * z; }; + auto f = [](ValueType x, ValueType y, ValueType z) { + return ValueType(6) * x + ValueType(6) * y + ValueType(6) * z; + }; // matrix - std::vector row_ptrs(dp_3 + 1); - std::vector col_idxs((3 * dp - 2) * (3 * dp - 2) * (3 * dp - 2)); - std::vector values((3 * dp - 2) * (3 * dp - 2) * (3 * dp - 2)); + std::vector row_ptrs(dp_3 + 1); + std::vector col_idxs((3 * dp - 2) * (3 * dp - 2) * (3 * dp - 2)); + std::vector values((3 * dp - 2) * (3 * dp - 2) * (3 * dp - 2)); // right hand side - std::vector rhs(dp_3); + std::vector rhs(dp_3); // solution - std::vector u(dp_3, 0.0); + std::vector u(dp_3, 0.0); generate_stencil_matrix(dp, row_ptrs.data(), col_idxs.data(), values.data(), coefs.data()); // looking for solution u = x^3: f = 6x, u(0) = 0, u(1) = 1 generate_rhs(dp, f, correct_u, rhs.data(), coefs.data()); - auto start_time = std::chrono::steady_clock::now(); - solve_system(executor_string, dp, row_ptrs.data(), col_idxs.data(), - values.data(), rhs.data(), u.data(), 1e-12); + const gko::remove_complex reduction_factor = 1e-7; + auto start_time = std::chrono::steady_clock::now(); + solve_system(executor_string, dp, row_ptrs.data(), col_idxs.data(), + values.data(), rhs.data(), u.data(), reduction_factor); auto stop_time = std::chrono::steady_clock::now(); - double runtime_duration = - std::chrono::duration_cast(stop_time - - start_time) - .count() * + + const auto runtime_duration = + static_cast( + std::chrono::duration_cast(stop_time - + start_time) + 
.count()) * 1e-6; - print_solution(dp, u.data()); + print_solution(dp, u.data()); std::cout << "The average relative error is " - << calculate_error(dp, u.data(), correct_u) / dp_3 << std::endl; + << calculate_error(dp, u.data(), correct_u) / + static_cast>(dp_3) + << std::endl; std::cout << "The runtime is " << std::to_string(runtime_duration) << " ms" << std::endl; diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt new file mode 100644 index 00000000000..6ae0ff2c655 --- /dev/null +++ b/hip/CMakeLists.txt @@ -0,0 +1,293 @@ +if(NOT DEFINED ROCM_PATH) + if(DEFINED ENV{ROCM_PATH}) + set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCM has been installed") + elseif(DEFINED ENV{HIP_PATH}) + set(ROCM_PATH "$ENV{HIP_PATH}/.." CACHE PATH "Path to which ROCM has been installed") + else() + set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCM has been installed") + endif() +endif() + +if(NOT DEFINED HIPBLAS_PATH) + if(DEFINED ENV{HIPBLAS_PATH}) + set(HIPBLAS_PATH $ENV{HIPBLAS_PATH} CACHE PATH "Path to which HIPBLAS has been installed") + else() + set(HIPBLAS_PATH "${ROCM_PATH}/hipblas" CACHE PATH "Path to which HIPBLAS has been installed") + endif() +endif() + +if(NOT DEFINED HIPSPARSE_PATH) + if(DEFINED ENV{HIPSPARSE_PATH}) + set(HIPSPARSE_PATH $ENV{HIPSPARSE_PATH} CACHE PATH "Path to which HIPSPARSE has been installed") + else() + set(HIPSPARSE_PATH "${ROCM_PATH}/hipsparse" CACHE PATH "Path to which HIPSPARSE has been installed") + endif() +endif() + +## Both the definition of `HCC_PATH` and `HIP_HIPCC_CMAKE_LINKER_HELPER` are required +## before including `FindHIP`, as these are essential but not defined in the beginning +## of the `FindHIP` file itself. Not defining these currently results in: +## 1. Without `HCC_PATH`: the `hcc` backend not working properly if it is wrongly set, +## if it is not set, popentially all compilation could fail. +## 2. 
Without `HIP_HIPCC_CMAKE_LINKER_HELPER` two compilations are required, since +## `FindHIP` defines this only in macro calls, which we call much later on after +## including the file itself. +if(NOT DEFINED HCC_PATH) + if(DEFINED ENV{HCC_PATH}) + set(HCC_PATH $ENV{HCC_PATH} CACHE PATH "Path to which HCC has been installed") + else() + set(HCC_PATH "${ROCM_PATH}/hcc" CACHE PATH "Path to which HCC has been installed") + endif() + set(HCC_HOME "${HCC_PATH}") +endif() + +if(NOT DEFINED HIP_CLANG_PATH) + if(NOT DEFINED ENV{HIP_CLANG_PATH}) + set(HIP_CLANG_PATH "${ROCM_PATH}/llvm/bin" CACHE PATH "Path to which HIP compatible clang binaries have been installed") + else() + set(HIP_CLANG_PATH $ENV{HIP_CLANG_PATH} CACHE PATH "Path to which HIP compatible clang binaries have been installed") + endif() +endif() + +# Find HIPCC_CMAKE_LINKER_HELPER executable +find_program( + HIP_HIPCC_CMAKE_LINKER_HELPER + NAMES hipcc_cmake_linker_helper + PATHS + "${HIP_ROOT_DIR}" + ENV ROCM_PATH + ENV HIP_PATH + /opt/rocm + /opt/rocm/hip + PATH_SUFFIXES bin + NO_DEFAULT_PATH +) +if(NOT HIP_HIPCC_CMAKE_LINKER_HELPER) + # Now search in default paths + find_program(HIP_HIPCC_CMAKE_LINKER_HELPER hipcc_cmake_linker_helper) +endif() + +find_program( + HIP_HIPCONFIG_EXECUTABLE + NAMES hipconfig + PATHS + "${HIP_ROOT_DIR}" + ENV ROCM_PATH + ENV HIP_PATH + /opt/rocm + /opt/rocm/hip + PATH_SUFFIXES bin + NO_DEFAULT_PATH +) +if(NOT HIP_HIPCONFIG_EXECUTABLE) + # Now search in default paths + find_program(HIP_HIPCONFIG_EXECUTABLE hipconfig) +endif() + +execute_process( + COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --version + OUTPUT_VARIABLE GINKGO_HIP_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_STRIP_TRAILING_WHITESPACE + ) +set(GINKGO_HIP_VERSION ${GINKGO_HIP_VERSION} PARENT_SCOPE) + +if (GINKGO_HIP_PLATFORM MATCHES "nvcc") # ensure ENV{CUDA_PATH} is set by the user + if (NOT DEFINED ENV{CUDA_PATH}) + find_path(GINKGO_HIP_DEFAULT_CUDA_PATH "cuda.h" PATH /usr/local/cuda/include NO_DEFAULT_PATH) + if (NOT 
GINKGO_HIP_DEFAULT_CUDA_PATH) + message(FATAL_ERROR "HIP nvcc backend was requested but CUDA could not be located. " + "Set and export the environment variable CUDA_PATH.") + endif() + endif() +endif() + +if (GINKGO_HIP_PLATFORM STREQUAL "hcc") + # This is required by hipblas/hipsparse in the case where the platform is hcc. + # For nvcc platform, these aren't required and only cause trouble. + list(APPEND CMAKE_PREFIX_PATH + "${HIP_PATH}/lib/cmake" + "${HIP_PATH}/../lib/cmake" # hopefully catches all extra HIP dependencies, e.g. hcc + ) +endif() + + +## Setup all CMAKE variables to find HIP and its dependencies +list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake") +list(APPEND CMAKE_PREFIX_PATH + "${HIPBLAS_PATH}/lib/cmake" + "${HIPSPARSE_PATH}/lib/cmake" +) +# Set CMAKE_MODULE_PATH and CMAKE_PREFIX_PATH as PARENT_SCOPE to easily find HIP again +set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" PARENT_SCOPE) +set(CMAKE_PREFIX_PATH "${CMAKE_PREFIX_PATH}" PARENT_SCOPE) + +# setting the default flags like CMAKE_{LANG}_FLAGS_{TYPE} +# the setting is copied from the default CMAKE_CXX_FLAGS_{TYPE} +set(HIP_HIPCC_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}" CACHE STRING "Flags used by the HIPCC compiler during DEBUG builds") +set(HIP_HIPCC_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL}" CACHE STRING "Flags used by the HIPCC compiler during MINSIZEREL builds") +set(HIP_HIPCC_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}" CACHE STRING "Flags used by the HIPCC compiler during RELEASE builds") +set(HIP_HIPCC_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}" CACHE STRING "Flags used by the HIPCC compiler during RELWITHDEBINFO builds") + +find_package(HIP REQUIRED) +find_package(hipblas REQUIRED) +find_package(hipsparse REQUIRED) +find_path(GINKGO_HIP_THRUST_PATH "thrust/complex.h" + PATHS "${HIP_PATH}/../include" + ENV HIP_THRUST_PATH) +if (NOT GINKGO_HIP_THRUST_PATH) + message(FATAL_ERROR "Could not find the ROCm header thrust/complex.h which is required by Ginkgo HIP.") +endif() + 
+set(GINKGO_HIP_SOURCES + base/exception.hip.cpp + base/executor.hip.cpp + base/version.hip.cpp + components/fill_array.hip.cpp + components/precision_conversion.hip.cpp + components/prefix_sum.hip.cpp + factorization/ilu_kernels.hip.cpp + factorization/factorization_kernels.hip.cpp + factorization/par_ict_kernels.hip.cpp + factorization/par_ilu_kernels.hip.cpp + factorization/par_ilut_approx_filter_kernel.hip.cpp + factorization/par_ilut_filter_kernel.hip.cpp + factorization/par_ilut_select_common.hip.cpp + factorization/par_ilut_select_kernel.hip.cpp + factorization/par_ilut_spgeam_kernel.hip.cpp + factorization/par_ilut_sweep_kernel.hip.cpp + matrix/coo_kernels.hip.cpp + matrix/csr_kernels.hip.cpp + matrix/dense_kernels.hip.cpp + matrix/ell_kernels.hip.cpp + matrix/hybrid_kernels.hip.cpp + matrix/sellp_kernels.hip.cpp + matrix/sparsity_csr_kernels.hip.cpp + preconditioner/isai_kernels.hip.cpp + preconditioner/jacobi_advanced_apply_kernel.hip.cpp + preconditioner/jacobi_generate_kernel.hip.cpp + preconditioner/jacobi_kernels.hip.cpp + preconditioner/jacobi_simple_apply_kernel.hip.cpp + solver/bicg_kernels.hip.cpp + solver/bicgstab_kernels.hip.cpp + solver/cg_kernels.hip.cpp + solver/cgs_kernels.hip.cpp + solver/fcg_kernels.hip.cpp + solver/gmres_kernels.hip.cpp + solver/ir_kernels.hip.cpp + solver/lower_trs_kernels.hip.cpp + solver/upper_trs_kernels.hip.cpp + stop/criterion_kernels.hip.cpp + stop/residual_norm_kernels.hip.cpp) + +set(GINKGO_HIP_NVCC_ARCH "") +if (GINKGO_HIP_PLATFORM MATCHES "nvcc") + if (NOT CMAKE_CUDA_HOST_COMPILER AND NOT GINKGO_CUDA_DEFAULT_HOST_COMPILER) + set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE STRING "" FORCE) + elseif(GINKGO_CUDA_DEFAULT_HOST_COMPILER) + unset(CMAKE_CUDA_HOST_COMPILER CACHE) + endif() + if (CMAKE_CUDA_HOST_COMPILER) + set(GINKGO_HIP_CUDA_HOST_COMPILER "-ccbin=${CMAKE_CUDA_HOST_COMPILER}") + endif() + + # Remove false positive CUDA warnings when calling one() and zero() + # This creates a compilation bug 
on nvcc 9.0.102 *with* the new array_deleter + # merged at commit ed12b3df5d26 + if(NOT CMAKE_CUDA_COMPILER_VERSION MATCHES "9.0") + set(GINKGO_HIP_NVCC_ADDITIONAL_FLAGS --expt-relaxed-constexpr) + endif() + # add gpu architecture flags + include(CudaArchitectureSelector) + cas_target_cuda_architectures_plain(GINKGO_HIP_NVCC_ARCH + ARCHITECTURES ${GINKGO_CUDA_ARCHITECTURES} + UNSUPPORTED "20" "21") +endif() +set(GINKGO_HIPCC_OPTIONS ${GINKGO_HIP_COMPILER_FLAGS}) +set(GINKGO_HIP_NVCC_OPTIONS ${GINKGO_HIP_NVCC_COMPILER_FLAGS} ${GINKGO_HIP_NVCC_ARCH} ${GINKGO_HIP_NVCC_ADDITIONAL_FLAGS}) +set(GINKGO_HIP_HCC_OPTIONS ${GINKGO_HIP_HCC_COMPILER_FLAGS}) +set(GINKGO_HIP_CLANG_OPTIONS ${GINKGO_HIP_CLANG_COMPILER_FLAGS}) + +set_source_files_properties(${GINKGO_HIP_SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT TRUE) +if (GINKGO_HIP_VERSION VERSION_GREATER_EQUAL "3.5") + hip_add_library(ginkgo_hip $ ${GINKGO_HIP_SOURCES} + HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS} "-std=c++11" + HCC_OPTIONS ${GINKGO_HIP_HCC_OPTIONS} + CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS} + NVCC_OPTIONS ${GINKGO_HIP_NVCC_OPTIONS} ${GINKGO_HIP_CUDA_HOST_COMPILER} + ${GINKGO_STATIC_OR_SHARED}) +else() + hip_add_library(ginkgo_hip $ ${GINKGO_HIP_SOURCES} + HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS} "-std=c++11" + HCC_OPTIONS ${GINKGO_HIP_HCC_OPTIONS} + NVCC_OPTIONS ${GINKGO_HIP_NVCC_OPTIONS} ${GINKGO_HIP_CUDA_HOST_COMPILER} + ${GINKGO_STATIC_OR_SHARED}) +endif() + +if(GINKGO_HIP_AMDGPU AND GINKGO_HIP_PLATFORM MATCHES "hcc") + foreach(target ${GINKGO_HIP_AMDGPU}) + target_compile_options(ginkgo_hip PRIVATE --amdgpu-target=${target}) + target_link_libraries(ginkgo_hip PRIVATE --amdgpu-target=${target}) + endforeach() +endif() + +target_compile_options(ginkgo_hip PRIVATE $<$:${GINKGO_COMPILER_FLAGS}>) +if(GINKGO_WITH_CLANG_TIDY AND GINKGO_CLANG_TIDY_PATH) + set_property(TARGET ginkgo_hip PROPERTY CXX_CLANG_TIDY "${GINKGO_CLANG_TIDY_PATH};-checks=*") +endif() +if(GINKGO_WITH_IWYU AND GINKGO_IWYU_PATH) + set_property(TARGET 
ginkgo_hip PROPERTY CXX_INCLUDE_WHAT_YOU_USE ${GINKGO_IWYU_PATH}) +endif() + +if(GINKGO_HIP_PLATFORM MATCHES "hcc") + # Fix the exception thrown bug with `hcc` backend and shared libraries + set_target_properties(ginkgo_hip PROPERTIES LINKER_LANGUAGE HIP) + + # Ban `-hc` flag as INTERFACE_LINK_LIBRARIES since that is propagated when building + # a static library, and it's definitely not a known option to any compiler. + ginkgo_hip_ban_link_hcflag(hcc::hccrt) + + if (NOT BUILD_SHARED_LIBS) + # Do not let hip::device flags propagate to executables which don't + # directly use HIP + ginkgo_hip_clang_ban_hip_device_flags() + endif() + target_link_libraries(ginkgo_hip PRIVATE hip::device) +elseif(GINKGO_HIP_PLATFORM MATCHES "nvcc") + find_package(CUDA 9.0 REQUIRED) + target_link_libraries(ginkgo_hip PUBLIC ${CUDA_LIBRARIES}) + set(HIP_CUDA_LIBRARIES ${CUDA_LIBRARIES} PARENT_SCOPE) +endif() + +target_link_libraries(ginkgo_hip PRIVATE roc::hipblas roc::hipsparse) + +target_include_directories(ginkgo_hip + PUBLIC + ${HIP_INCLUDE_DIRS} + PRIVATE + ${GINKGO_HIP_THRUST_PATH} + ${HIPBLAS_INCLUDE_DIRS} + ${HIPSPARSE_INCLUDE_DIRS} + $) + +ginkgo_compile_features(ginkgo_hip) +ginkgo_default_includes(ginkgo_hip) +ginkgo_install_library(ginkgo_hip hip) + +if (GINKGO_CHECK_CIRCULAR_DEPS) + ginkgo_check_headers(ginkgo_hip) +endif() + +if(GINKGO_BUILD_TESTS) + # Here, we go through all of Ginkgo's dependencies to build a `-Wl,-rpath` string since for + # some reason `hipcc` through CMake does not have rpath settings unlike the other compilers. 
+ get_target_property(GINKGO_LINK_LIBRARIES ginkgo LINK_LIBRARIES) + set(GINKGO_RPATH_FOR_HIP "-Wl,-rpath,$") + foreach(target ${GINKGO_LINK_LIBRARIES}) + if("${target}" MATCHES "^ginkgo") + set(GINKGO_RPATH_FOR_HIP "${GINKGO_RPATH_FOR_HIP}:$") + endif() + endforeach() + + add_subdirectory(test) +endif() diff --git a/hip/base/config.hip.hpp b/hip/base/config.hip.hpp new file mode 100644 index 00000000000..d698a6a8d83 --- /dev/null +++ b/hip/base/config.hip.hpp @@ -0,0 +1,97 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_HIP_BASE_CONFIG_HIP_HPP_ +#define GKO_HIP_BASE_CONFIG_HIP_HPP_ + + +#include + + +#include + + +#include + + +#include "hip/base/math.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { + + +struct config { + /** + * The type containing a bitmask over all lanes of a warp. + */ +#if GINKGO_HIP_PLATFORM_HCC + using lane_mask_type = uint64; +#else // GINKGO_HIP_PLATFORM_NVCC + using lane_mask_type = uint32; +#endif + + /** + * The number of threads within a HIP warp. Here, we use the definition from + * `device_functions.h`. + */ +#if GINKGO_HIP_PLATFORM_HCC + static constexpr uint32 warp_size = warpSize; +#else // GINKGO_HIP_PLATFORM_NVCC + static constexpr uint32 warp_size = 32; +#endif + + /** + * The bitmask of the entire warp. + */ + static constexpr auto full_lane_mask = ~zero(); + + /** + * The maximal number of threads allowed in a HIP block. + */ + static constexpr uint32 max_block_size = 1024; + + /** + * The minimal amount of warps that need to be scheduled for each block + * to maximize GPU occupancy.
+ */ + static constexpr uint32 min_warps_per_block = 4; +}; + + +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_BASE_CONFIG_HIP_HPP_ diff --git a/hip/base/device_guard.hip.hpp b/hip/base/device_guard.hip.hpp new file mode 100644 index 00000000000..b7d63ebc152 --- /dev/null +++ b/hip/base/device_guard.hip.hpp @@ -0,0 +1,93 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_HIP_BASE_DEVICE_GUARD_HIP_HPP_ +#define GKO_HIP_BASE_DEVICE_GUARD_HIP_HPP_ + + +#include + + +#include + + +#include + + +namespace gko { +namespace hip { + + +/** + * This class defines a device guard for the hip functions and the hip module. + * The guard is used to make sure that the device code is run on the correct + * hip device, when run with multiple devices. The class records the current + * device id and uses `hipSetDevice` to set the device id to the one being + * passed in. After the scope has been exited, the destructor sets the device_id + * back to the one before entering the scope. + */ +class device_guard { +public: + device_guard(int device_id) + { + GKO_ASSERT_NO_HIP_ERRORS(hipGetDevice(&original_device_id)); + GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(device_id)); + } + + device_guard(device_guard &other) = delete; + + device_guard &operator=(const device_guard &other) = delete; + + device_guard(device_guard &&other) = delete; + + device_guard const &operator=(device_guard &&other) = delete; + + ~device_guard() noexcept(false) + { + /* Ignore the error during stack unwinding for this call */ + if (std::uncaught_exception()) { + hipSetDevice(original_device_id); + } else { + GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(original_device_id)); + } + } + +private: + int original_device_id{}; +}; + + +} // namespace hip +} // namespace gko + + +#endif // GKO_HIP_BASE_DEVICE_GUARD_HIP_HPP_ diff --git a/hip/base/exception.hip.cpp b/hip/base/exception.hip.cpp new file mode 100644 index 00000000000..9e6f2ff7a00 --- /dev/null +++ b/hip/base/exception.hip.cpp @@ -0,0 +1,101 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include +#include +#include + + +#include + + +namespace gko { + + +std::string HipError::get_error(int64 error_code) +{ + std::string name = hipGetErrorName(static_cast(error_code)); + std::string message = + hipGetErrorString(static_cast(error_code)); + return name + ": " + message; +} + + +std::string HipblasError::get_error(int64 error_code) +{ +#define GKO_REGISTER_HIPBLAS_ERROR(error_name) \ + if (error_code == static_cast(error_name)) { \ + return #error_name; \ + } + GKO_REGISTER_HIPBLAS_ERROR(HIPBLAS_STATUS_SUCCESS); + GKO_REGISTER_HIPBLAS_ERROR(HIPBLAS_STATUS_NOT_INITIALIZED); + GKO_REGISTER_HIPBLAS_ERROR(HIPBLAS_STATUS_ALLOC_FAILED); + GKO_REGISTER_HIPBLAS_ERROR(HIPBLAS_STATUS_INVALID_VALUE); + GKO_REGISTER_HIPBLAS_ERROR(HIPBLAS_STATUS_ARCH_MISMATCH); + GKO_REGISTER_HIPBLAS_ERROR(HIPBLAS_STATUS_MAPPING_ERROR); + GKO_REGISTER_HIPBLAS_ERROR(HIPBLAS_STATUS_EXECUTION_FAILED); + GKO_REGISTER_HIPBLAS_ERROR(HIPBLAS_STATUS_INTERNAL_ERROR); + GKO_REGISTER_HIPBLAS_ERROR(HIPBLAS_STATUS_NOT_SUPPORTED); + return "Unknown error"; + +#undef GKO_REGISTER_HIPBLAS_ERROR +} + + +std::string HipsparseError::get_error(int64 error_code) +{ +#define GKO_REGISTER_HIPSPARSE_ERROR(error_name) \ + if (error_code == int64(error_name)) { \ + return #error_name; \ + } + GKO_REGISTER_HIPSPARSE_ERROR(HIPSPARSE_STATUS_SUCCESS); + GKO_REGISTER_HIPSPARSE_ERROR(HIPSPARSE_STATUS_NOT_INITIALIZED); + GKO_REGISTER_HIPSPARSE_ERROR(HIPSPARSE_STATUS_ALLOC_FAILED); + GKO_REGISTER_HIPSPARSE_ERROR(HIPSPARSE_STATUS_INVALID_VALUE); + GKO_REGISTER_HIPSPARSE_ERROR(HIPSPARSE_STATUS_ARCH_MISMATCH); + GKO_REGISTER_HIPSPARSE_ERROR(HIPSPARSE_STATUS_MAPPING_ERROR); + GKO_REGISTER_HIPSPARSE_ERROR(HIPSPARSE_STATUS_EXECUTION_FAILED); + GKO_REGISTER_HIPSPARSE_ERROR(HIPSPARSE_STATUS_INTERNAL_ERROR); + GKO_REGISTER_HIPSPARSE_ERROR(HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); + return "Unknown error"; + +#undef 
GKO_REGISTER_HIPSPARSE_ERROR +} + + +} // namespace gko diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp new file mode 100644 index 00000000000..9592bc20b8d --- /dev/null +++ b/hip/base/executor.hip.cpp @@ -0,0 +1,226 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include + + +#include "hip/base/config.hip.hpp" +#include "hip/base/device_guard.hip.hpp" +#include "hip/base/hipblas_bindings.hip.hpp" +#include "hip/base/hipsparse_bindings.hip.hpp" + + +namespace gko { + + +#include "common/base/executor.hpp.inc" + + +std::shared_ptr HipExecutor::create( + int device_id, std::shared_ptr master, bool device_reset) +{ + return std::shared_ptr( + new HipExecutor(device_id, std::move(master), device_reset), + [device_id](HipExecutor *exec) { + delete exec; + if (!HipExecutor::get_num_execs(device_id) && + exec->get_device_reset()) { + hip::device_guard g(device_id); + hipDeviceReset(); + } + }); +} + + +void OmpExecutor::raw_copy_to(const HipExecutor *dest, size_type num_bytes, + const void *src_ptr, void *dest_ptr) const +{ + if (num_bytes > 0) { + hip::device_guard g(dest->get_device_id()); + GKO_ASSERT_NO_HIP_ERRORS( + hipMemcpy(dest_ptr, src_ptr, num_bytes, hipMemcpyHostToDevice)); + } +} + + +void HipExecutor::raw_free(void *ptr) const noexcept +{ + hip::device_guard g(this->get_device_id()); + auto error_code = hipFree(ptr); + if (error_code != hipSuccess) { +#if GKO_VERBOSE_LEVEL >= 1 + // Unfortunately, if memory free fails, there's not much we can do + std::cerr << "Unrecoverable HIP error on device " << this->device_id_ + << " in " << __func__ << ": " << hipGetErrorName(error_code) + << ": " << hipGetErrorString(error_code) << std::endl + << "Exiting program" << std::endl; +#endif + std::exit(error_code); + } +} + + +void *HipExecutor::raw_alloc(size_type num_bytes) const +{ + void *dev_ptr = nullptr; + hip::device_guard g(this->get_device_id()); + auto error_code = hipMalloc(&dev_ptr, num_bytes); + if (error_code != hipErrorMemoryAllocation) { + GKO_ASSERT_NO_HIP_ERRORS(error_code); + } + GKO_ENSURE_ALLOCATED(dev_ptr, "hip", num_bytes); + return dev_ptr; +} + + +void HipExecutor::raw_copy_to(const 
OmpExecutor *, size_type num_bytes, + const void *src_ptr, void *dest_ptr) const +{ + if (num_bytes > 0) { + hip::device_guard g(this->get_device_id()); + GKO_ASSERT_NO_HIP_ERRORS( + hipMemcpy(dest_ptr, src_ptr, num_bytes, hipMemcpyDeviceToHost)); + } +} + + +void HipExecutor::raw_copy_to(const CudaExecutor *src, size_type num_bytes, + const void *src_ptr, void *dest_ptr) const +{ +#if GINKGO_HIP_PLATFORM_NVCC == 1 + if (num_bytes > 0) { + hip::device_guard g(this->get_device_id()); + GKO_ASSERT_NO_HIP_ERRORS(hipMemcpyPeer(dest_ptr, this->device_id_, + src_ptr, src->get_device_id(), + num_bytes)); + } +#else + GKO_NOT_SUPPORTED(this); +#endif +} + + +void HipExecutor::raw_copy_to(const HipExecutor *src, size_type num_bytes, + const void *src_ptr, void *dest_ptr) const +{ + if (num_bytes > 0) { + hip::device_guard g(this->get_device_id()); + GKO_ASSERT_NO_HIP_ERRORS(hipMemcpyPeer(dest_ptr, this->device_id_, + src_ptr, src->get_device_id(), + num_bytes)); + } +} + + +void HipExecutor::synchronize() const +{ + hip::device_guard g(this->get_device_id()); + GKO_ASSERT_NO_HIP_ERRORS(hipDeviceSynchronize()); +} + + +void HipExecutor::run(const Operation &op) const +{ + this->template log(this, &op); + hip::device_guard g(this->get_device_id()); + op.run( + std::static_pointer_cast(this->shared_from_this())); + this->template log(this, &op); +} + + +int HipExecutor::get_num_devices() +{ + int deviceCount = 0; + auto error_code = hipGetDeviceCount(&deviceCount); + if (error_code == hipErrorNoDevice) { + return 0; + } + GKO_ASSERT_NO_HIP_ERRORS(error_code); + return deviceCount; +} + + +void HipExecutor::set_gpu_property() +{ + if (device_id_ < this->get_num_devices() && device_id_ >= 0) { + hip::device_guard g(this->get_device_id()); + GKO_ASSERT_NO_HIP_ERRORS(hipDeviceGetAttribute( + &num_multiprocessor_, hipDeviceAttributeMultiprocessorCount, + device_id_)); + GKO_ASSERT_NO_HIP_ERRORS(hipDeviceGetAttribute( + &major_, hipDeviceAttributeComputeCapabilityMajor, 
device_id_)); + GKO_ASSERT_NO_HIP_ERRORS(hipDeviceGetAttribute( + &minor_, hipDeviceAttributeComputeCapabilityMinor, device_id_)); +#if GINKGO_HIP_PLATFORM_NVCC + num_warps_per_sm_ = convert_sm_ver_to_cores(major_, minor_) / + kernels::hip::config::warp_size; +#else + // In GCN (Graphics Core Next), each multiprocessor has 4 SIMD + // Reference: https://en.wikipedia.org/wiki/Graphics_Core_Next + num_warps_per_sm_ = 4; +#endif // GINKGO_HIP_PLATFORM_NVCC + warp_size_ = kernels::hip::config::warp_size; + } +} + + +void HipExecutor::init_handles() +{ + if (device_id_ < this->get_num_devices() && device_id_ >= 0) { + const auto id = this->get_device_id(); + hip::device_guard g(id); + this->hipblas_handle_ = handle_manager( + kernels::hip::hipblas::init(), [id](hipblasContext *handle) { + hip::device_guard g(id); + kernels::hip::hipblas::destroy_hipblas_handle(handle); + }); + this->hipsparse_handle_ = handle_manager( + kernels::hip::hipsparse::init(), [id](hipsparseContext *handle) { + hip::device_guard g(id); + kernels::hip::hipsparse::destroy_hipsparse_handle(handle); + }); + } +} + + +} // namespace gko diff --git a/hip/base/hipblas_bindings.hip.hpp b/hip/base/hipblas_bindings.hip.hpp new file mode 100644 index 00000000000..7bef3278f79 --- /dev/null +++ b/hip/base/hipblas_bindings.hip.hpp @@ -0,0 +1,275 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_HIP_BASE_HIPBLAS_BINDINGS_HIP_HPP_ +#define GKO_HIP_BASE_HIPBLAS_BINDINGS_HIP_HPP_ + + +#include + + +#include +#include + + +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" + + +namespace gko { +/** + * @brief The device specific kernels namespace. + * + * @ingroup kernels + */ +namespace kernels { +/** + * @brief The HIP namespace. + * + * @ingroup hip + */ +namespace hip { +/** + * @brief The HIPBLAS namespace. + * + * @ingroup hipblas + */ +namespace hipblas { +/** + * @brief The detail namespace. + * + * @ingroup detail + */ +namespace detail { + + +template +inline int64 not_implemented(Args &&...) 
+{ + return static_cast(HIPBLAS_STATUS_NOT_SUPPORTED); +} + + +} // namespace detail + + +template +struct is_supported : std::false_type {}; + +template <> +struct is_supported : std::true_type {}; + +template <> +struct is_supported : std::true_type {}; + +// hipblas supports part of complex function version is >= 0.19, but the version +// is not set now. +/* not implemented +template <> +struct is_supported> : std::true_type {}; + +template <> +struct is_supported> : std::true_type {}; +*/ + + +#define GKO_BIND_HIPBLAS_GEMM(ValueType, HipblasName) \ + inline void gemm(hipblasHandle_t handle, hipblasOperation_t transa, \ + hipblasOperation_t transb, int m, int n, int k, \ + const ValueType *alpha, const ValueType *a, int lda, \ + const ValueType *b, int ldb, const ValueType *beta, \ + ValueType *c, int ldc) \ + { \ + GKO_ASSERT_NO_HIPBLAS_ERRORS(HipblasName( \ + handle, transa, transb, m, n, k, as_hiplibs_type(alpha), \ + as_hiplibs_type(a), lda, as_hiplibs_type(b), ldb, \ + as_hiplibs_type(beta), as_hiplibs_type(c), ldc)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPBLAS_GEMM(float, hipblasSgemm); +GKO_BIND_HIPBLAS_GEMM(double, hipblasDgemm); +/* not implemented +GKO_BIND_HIPBLAS_GEMM(std::complex, hipblasCgemm); +GKO_BIND_HIPBLAS_GEMM(std::complex, hipblasZgemm); +*/ +template +GKO_BIND_HIPBLAS_GEMM(ValueType, detail::not_implemented); + +#undef GKO_BIND_HIPBLAS_GEMM + + +#define GKO_BIND_HIPBLAS_GEAM(ValueType, HipblasName) \ + inline void geam(hipblasHandle_t handle, hipblasOperation_t transa, \ + hipblasOperation_t transb, int m, int n, \ + const ValueType *alpha, const ValueType *a, int lda, \ + const ValueType *beta, const ValueType *b, int ldb, \ + ValueType *c, int ldc) \ + { \ + GKO_ASSERT_NO_HIPBLAS_ERRORS( \ + HipblasName(handle, transa, transb, m, n, as_hiplibs_type(alpha), \ + as_hiplibs_type(a), lda, as_hiplibs_type(beta), \ + as_hiplibs_type(b), ldb, 
as_hiplibs_type(c), ldc)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPBLAS_GEAM(float, hipblasSgeam); +GKO_BIND_HIPBLAS_GEAM(double, hipblasDgeam); +// Hipblas does not provide geam complex version yet. +template +GKO_BIND_HIPBLAS_GEAM(ValueType, detail::not_implemented); + +#undef GKO_BIND_HIPBLAS_GEAM + + +#define GKO_BIND_HIPBLAS_SCAL(ValueType, HipblasName) \ + inline void scal(hipblasHandle_t handle, int n, const ValueType *alpha, \ + ValueType *x, int incx) \ + { \ + GKO_ASSERT_NO_HIPBLAS_ERRORS(HipblasName( \ + handle, n, as_hiplibs_type(alpha), as_hiplibs_type(x), incx)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPBLAS_SCAL(float, hipblasSscal); +GKO_BIND_HIPBLAS_SCAL(double, hipblasDscal); +/* not implemented +GKO_BIND_HIPBLAS_SCAL(std::complex, hipblasCscal); +GKO_BIND_HIPBLAS_SCAL(std::complex, hipblasZscal); +*/ +template +GKO_BIND_HIPBLAS_SCAL(ValueType, detail::not_implemented); + +#undef GKO_BIND_HIPBLAS_SCAL + + +#define GKO_BIND_HIPBLAS_AXPY(ValueType, HipblasName) \ + inline void axpy(hipblasHandle_t handle, int n, const ValueType *alpha, \ + const ValueType *x, int incx, ValueType *y, int incy) \ + { \ + GKO_ASSERT_NO_HIPBLAS_ERRORS( \ + HipblasName(handle, n, as_hiplibs_type(alpha), as_hiplibs_type(x), \ + incx, as_hiplibs_type(y), incy)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPBLAS_AXPY(float, hipblasSaxpy); +GKO_BIND_HIPBLAS_AXPY(double, hipblasDaxpy); +/* not implemented +GKO_BIND_HIPBLAS_AXPY(std::complex, hipblasCaxpy); +GKO_BIND_HIPBLAS_AXPY(std::complex, hipblasZaxpy); +*/ +template +GKO_BIND_HIPBLAS_AXPY(ValueType, detail::not_implemented); + +#undef GKO_BIND_HIPBLAS_AXPY + + +#define GKO_BIND_HIPBLAS_DOT(ValueType, HipblasName) \ + inline void 
dot(hipblasHandle_t handle, int n, const ValueType *x, \ + int incx, const ValueType *y, int incy, ValueType *result) \ + { \ + GKO_ASSERT_NO_HIPBLAS_ERRORS( \ + HipblasName(handle, n, as_hiplibs_type(x), incx, \ + as_hiplibs_type(y), incy, as_hiplibs_type(result))); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPBLAS_DOT(float, hipblasSdot); +GKO_BIND_HIPBLAS_DOT(double, hipblasDdot); +/* not implemented +GKO_BIND_HIPBLAS_DOT(std::complex, hipblasCdotc); +GKO_BIND_HIPBLAS_DOT(std::complex, hipblasZdotc); +*/ +template +GKO_BIND_HIPBLAS_DOT(ValueType, detail::not_implemented); + +#undef GKO_BIND_HIPBLAS_DOT + + +#define GKO_BIND_HIPBLAS_NORM2(ValueType, HipblasName) \ + inline void norm2(hipblasHandle_t handle, int n, const ValueType *x, \ + int incx, remove_complex *result) \ + { \ + GKO_ASSERT_NO_HIPBLAS_ERRORS(HipblasName( \ + handle, n, as_hiplibs_type(x), incx, as_hiplibs_type(result))); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPBLAS_NORM2(float, hipblasSnrm2); +GKO_BIND_HIPBLAS_NORM2(double, hipblasDnrm2); +/* not implemented +GKO_BIND_HIPBLAS_NORM2(std::complex, hipblasScnrm2); +GKO_BIND_HIPBLAS_NORM2(std::complex, hipblasDznrm2); +*/ +template +GKO_BIND_HIPBLAS_NORM2(ValueType, detail::not_implemented); + +#undef GKO_BIND_HIPBLAS_NORM2 + + +inline hipblasContext *init() +{ + hipblasHandle_t handle; + GKO_ASSERT_NO_HIPBLAS_ERRORS(hipblasCreate(&handle)); + GKO_ASSERT_NO_HIPBLAS_ERRORS( + hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + return reinterpret_cast(handle); +} + + +inline void destroy_hipblas_handle(hipblasContext *handle) +{ + GKO_ASSERT_NO_HIPBLAS_ERRORS( + hipblasDestroy(reinterpret_cast(handle))); +} + + +} // namespace hipblas +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_BASE_HIPBLAS_BINDINGS_HIP_HPP_ diff 
--git a/hip/base/hipsparse_bindings.hip.hpp b/hip/base/hipsparse_bindings.hip.hpp new file mode 100644 index 00000000000..3b7c8a978a4 --- /dev/null +++ b/hip/base/hipsparse_bindings.hip.hpp @@ -0,0 +1,816 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_HIP_BASE_HIPSPARSE_BINDINGS_HIP_HPP_ +#define GKO_HIP_BASE_HIPSPARSE_BINDINGS_HIP_HPP_ + + +#include + + +#include +#include + + +#include "hip/base/types.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The HIPSPARSE namespace. + * + * @ingroup hipsparse + */ +namespace hipsparse { +/** + * @brief The detail namespace. + * + * @ingroup detail + */ +namespace detail { + + +template +inline int64 not_implemented(Args...) +{ + return static_cast(HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); +} + + +} // namespace detail + + +template +struct is_supported : std::false_type {}; + +template <> +struct is_supported : std::true_type {}; + +template <> +struct is_supported : std::true_type {}; + + +#define GKO_BIND_HIPSPARSE32_SPMV(ValueType, HipsparseName) \ + inline void spmv(hipsparseHandle_t handle, hipsparseOperation_t transA, \ + int32 m, int32 n, int32 nnz, const ValueType *alpha, \ + const hipsparseMatDescr_t descrA, \ + const ValueType *csrValA, const int32 *csrRowPtrA, \ + const int32 *csrColIndA, const ValueType *x, \ + const ValueType *beta, ValueType *y) \ + { \ + GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName( \ + handle, transA, m, n, nnz, as_hiplibs_type(alpha), descrA, \ + as_hiplibs_type(csrValA), csrRowPtrA, csrColIndA, \ + as_hiplibs_type(x), as_hiplibs_type(beta), as_hiplibs_type(y))); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_HIPSPARSE64_SPMV(ValueType, HipsparseName) \ + inline void spmv(hipsparseHandle_t handle, hipsparseOperation_t transA, \ + int64 m, int64 n, int64 nnz, const ValueType *alpha, \ + const hipsparseMatDescr_t descrA, \ + const ValueType *csrValA, const int64 *csrRowPtrA, \ + const int64 *csrColIndA, const ValueType *x, \ + const ValueType *beta, ValueType *y) GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used 
to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPSPARSE32_SPMV(float, hipsparseScsrmv); +GKO_BIND_HIPSPARSE32_SPMV(double, hipsparseDcsrmv); +GKO_BIND_HIPSPARSE64_SPMV(float, hipsparseScsrmv); +GKO_BIND_HIPSPARSE64_SPMV(double, hipsparseDcsrmv); +template +GKO_BIND_HIPSPARSE32_SPMV(ValueType, detail::not_implemented); +template +GKO_BIND_HIPSPARSE64_SPMV(ValueType, detail::not_implemented); + + +#undef GKO_BIND_HIPSPARSE32_SPMV +#undef GKO_BIND_HIPSPARSE64_SPMV + + +#define GKO_BIND_HIPSPARSE32_SPMM(ValueType, HipsparseName) \ + inline void spmm(hipsparseHandle_t handle, hipsparseOperation_t transA, \ + int32 m, int32 n, int32 k, int32 nnz, \ + const ValueType *alpha, const hipsparseMatDescr_t descrA, \ + const ValueType *csrValA, const int32 *csrRowPtrA, \ + const int32 *csrColIndA, const ValueType *B, int32 ldb, \ + const ValueType *beta, ValueType *C, int32 ldc) \ + { \ + GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName( \ + handle, transA, m, n, k, nnz, as_hiplibs_type(alpha), descrA, \ + as_hiplibs_type(csrValA), csrRowPtrA, csrColIndA, \ + as_hiplibs_type(B), ldb, as_hiplibs_type(beta), \ + as_hiplibs_type(C), ldc)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_HIPSPARSE64_SPMM(ValueType, HipsparseName) \ + inline void spmm(hipsparseHandle_t handle, hipsparseOperation_t transA, \ + int64 m, int64 n, int64 k, int64 nnz, \ + const ValueType *alpha, const hipsparseMatDescr_t descrA, \ + const ValueType *csrValA, const int64 *csrRowPtrA, \ + const int64 *csrColIndA, const ValueType *B, int64 ldb, \ + const ValueType *beta, ValueType *C, int64 ldc) \ + GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPSPARSE32_SPMM(float, hipsparseScsrmm); +GKO_BIND_HIPSPARSE32_SPMM(double, hipsparseDcsrmm); +GKO_BIND_HIPSPARSE64_SPMM(float, hipsparseScsrmm); 
+GKO_BIND_HIPSPARSE64_SPMM(double, hipsparseDcsrmm); +template +GKO_BIND_HIPSPARSE32_SPMM(ValueType, detail::not_implemented); +template +GKO_BIND_HIPSPARSE64_SPMM(ValueType, detail::not_implemented); + + +#undef GKO_BIND_HIPSPARSE32_SPMM +#undef GKO_BIND_HIPSPARSE64_SPMM + + +#define GKO_BIND_HIPSPARSE32_SPMV(ValueType, HipsparseName) \ + inline void spmv(hipsparseHandle_t handle, hipsparseOperation_t transA, \ + const ValueType *alpha, const hipsparseMatDescr_t descrA, \ + const hipsparseHybMat_t hybA, const ValueType *x, \ + const ValueType *beta, ValueType *y) \ + { \ + GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName( \ + handle, transA, as_hiplibs_type(alpha), descrA, hybA, \ + as_hiplibs_type(x), as_hiplibs_type(beta), as_hiplibs_type(y))); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPSPARSE32_SPMV(float, hipsparseShybmv); +GKO_BIND_HIPSPARSE32_SPMV(double, hipsparseDhybmv); +template +GKO_BIND_HIPSPARSE32_SPMV(ValueType, detail::not_implemented); + + +#undef GKO_BIND_HIPSPARSE32_SPMV + + +template +void spgemm_buffer_size( + hipsparseHandle_t handle, IndexType m, IndexType n, IndexType k, + const ValueType *alpha, const hipsparseMatDescr_t descrA, IndexType nnzA, + const IndexType *csrRowPtrA, const IndexType *csrColIndA, + const hipsparseMatDescr_t descrB, IndexType nnzB, + const IndexType *csrRowPtrB, const IndexType *csrColIndB, + const ValueType *beta, const hipsparseMatDescr_t descrD, IndexType nnzD, + const IndexType *csrRowPtrD, const IndexType *csrColIndD, + csrgemm2Info_t info, size_type &result) GKO_NOT_IMPLEMENTED; + +#define GKO_BIND_HIPSPARSE_SPGEMM_BUFFER_SIZE(ValueType, HipsparseName) \ + template <> \ + inline void spgemm_buffer_size( \ + hipsparseHandle_t handle, int32 m, int32 n, int32 k, \ + const ValueType *alpha, const hipsparseMatDescr_t descrA, int32 nnzA, \ + const int32 *csrRowPtrA, const int32 *csrColIndA, \ + const hipsparseMatDescr_t descrB, 
int32 nnzB, const int32 *csrRowPtrB, \ + const int32 *csrColIndB, const ValueType *beta, \ + const hipsparseMatDescr_t descrD, int32 nnzD, const int32 *csrRowPtrD, \ + const int32 *csrColIndD, csrgemm2Info_t info, size_type &result) \ + { \ + GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName( \ + handle, m, n, k, as_hiplibs_type(alpha), descrA, nnzA, csrRowPtrA, \ + csrColIndA, descrB, nnzB, csrRowPtrB, csrColIndB, \ + as_hiplibs_type(beta), descrD, nnzD, csrRowPtrD, csrColIndD, info, \ + &result)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPSPARSE_SPGEMM_BUFFER_SIZE(float, hipsparseScsrgemm2_bufferSizeExt); +GKO_BIND_HIPSPARSE_SPGEMM_BUFFER_SIZE(double, hipsparseDcsrgemm2_bufferSizeExt); +#if defined(hipsparseVersionMajor) && defined(hipsparseVersionMinor) && \ + ((hipsparseVersionMajor > 1) || \ + (hipsparseVersionMajor == 1 && hipsparseVersionMinor >= 4)) +GKO_BIND_HIPSPARSE_SPGEMM_BUFFER_SIZE(std::complex, + hipsparseCcsrgemm2_bufferSizeExt); +GKO_BIND_HIPSPARSE_SPGEMM_BUFFER_SIZE(std::complex, + hipsparseZcsrgemm2_bufferSizeExt); +#endif // hipsparse version >= 1.4 + + +#undef GKO_BIND_HIPSPARSE_SPGEMM_BUFFER_SIZE + + +template +void spgemm_nnz(hipsparseHandle_t handle, IndexType m, IndexType n, IndexType k, + const hipsparseMatDescr_t descrA, IndexType nnzA, + const IndexType *csrRowPtrA, const IndexType *csrColIndA, + const hipsparseMatDescr_t descrB, IndexType nnzB, + const IndexType *csrRowPtrB, const IndexType *csrColIndB, + const hipsparseMatDescr_t descrD, IndexType nnzD, + const IndexType *csrRowPtrD, const IndexType *csrColIndD, + const hipsparseMatDescr_t descrC, IndexType *csrRowPtrC, + IndexType *nnzC, csrgemm2Info_t info, + void *buffer) GKO_NOT_IMPLEMENTED; + +template <> +inline void spgemm_nnz( + hipsparseHandle_t handle, int32 m, int32 n, int32 k, + const hipsparseMatDescr_t descrA, int32 nnzA, const int32 *csrRowPtrA, + const int32 *csrColIndA, const 
hipsparseMatDescr_t descrB, int32 nnzB, + const int32 *csrRowPtrB, const int32 *csrColIndB, + const hipsparseMatDescr_t descrD, int32 nnzD, const int32 *csrRowPtrD, + const int32 *csrColIndD, const hipsparseMatDescr_t descrC, + int32 *csrRowPtrC, int32 *nnzC, csrgemm2Info_t info, void *buffer) +{ + GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseXcsrgemm2Nnz( + handle, m, n, k, descrA, nnzA, csrRowPtrA, csrColIndA, descrB, nnzB, + csrRowPtrB, csrColIndB, descrD, nnzD, csrRowPtrD, csrColIndD, descrC, + csrRowPtrC, nnzC, info, buffer)); +} + + +template +void spgemm(hipsparseHandle_t handle, IndexType m, IndexType n, IndexType k, + const ValueType *alpha, const hipsparseMatDescr_t descrA, + IndexType nnzA, const ValueType *csrValA, + const IndexType *csrRowPtrA, const IndexType *csrColIndA, + const hipsparseMatDescr_t descrB, IndexType nnzB, + const ValueType *csrValB, const IndexType *csrRowPtrB, + const IndexType *csrColIndB, const ValueType *beta, + const hipsparseMatDescr_t descrD, IndexType nnzD, + const ValueType *csrValD, const IndexType *csrRowPtrD, + const IndexType *csrColIndD, const hipsparseMatDescr_t descrC, + ValueType *csrValC, const IndexType *csrRowPtrC, + IndexType *csrColIndC, csrgemm2Info_t info, + void *buffer) GKO_NOT_IMPLEMENTED; + +#define GKO_BIND_HIPSPARSE_SPGEMM(ValueType, HipsparseName) \ + template <> \ + inline void spgemm( \ + hipsparseHandle_t handle, int32 m, int32 n, int32 k, \ + const ValueType *alpha, const hipsparseMatDescr_t descrA, int32 nnzA, \ + const ValueType *csrValA, const int32 *csrRowPtrA, \ + const int32 *csrColIndA, const hipsparseMatDescr_t descrB, int32 nnzB, \ + const ValueType *csrValB, const int32 *csrRowPtrB, \ + const int32 *csrColIndB, const ValueType *beta, \ + const hipsparseMatDescr_t descrD, int32 nnzD, \ + const ValueType *csrValD, const int32 *csrRowPtrD, \ + const int32 *csrColIndD, const hipsparseMatDescr_t descrC, \ + ValueType *csrValC, const int32 *csrRowPtrC, int32 *csrColIndC, \ + csrgemm2Info_t info, void 
*buffer) \ + { \ + GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName( \ + handle, m, n, k, as_hiplibs_type(alpha), descrA, nnzA, \ + as_hiplibs_type(csrValA), csrRowPtrA, csrColIndA, descrB, nnzB, \ + as_hiplibs_type(csrValB), csrRowPtrB, csrColIndB, \ + as_hiplibs_type(beta), descrD, nnzD, as_hiplibs_type(csrValD), \ + csrRowPtrD, csrColIndD, descrC, as_hiplibs_type(csrValC), \ + csrRowPtrC, csrColIndC, info, buffer)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPSPARSE_SPGEMM(float, hipsparseScsrgemm2); +GKO_BIND_HIPSPARSE_SPGEMM(double, hipsparseDcsrgemm2); +#if defined(hipsparseVersionMajor) && defined(hipsparseVersionMinor) && \ + ((hipsparseVersionMajor > 1) || \ + (hipsparseVersionMajor == 1 && hipsparseVersionMinor >= 4)) +GKO_BIND_HIPSPARSE_SPGEMM(std::complex, hipsparseCcsrgemm2); +GKO_BIND_HIPSPARSE_SPGEMM(std::complex, hipsparseZcsrgemm2); +#endif // hipsparse version >= 1.4 + + +#undef GKO_BIND_HIPSPARSE_SPGEMM + + +#define GKO_BIND_HIPSPARSE32_CSR2HYB(ValueType, HipsparseName) \ + inline void csr2hyb(hipsparseHandle_t handle, int32 m, int32 n, \ + const hipsparseMatDescr_t descrA, \ + const ValueType *csrValA, const int32 *csrRowPtrA, \ + const int32 *csrColIndA, hipsparseHybMat_t hybA, \ + int32 userEllWidth, \ + hipsparseHybPartition_t partitionType) \ + { \ + GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName( \ + handle, m, n, descrA, as_hiplibs_type(csrValA), csrRowPtrA, \ + csrColIndA, hybA, userEllWidth, partitionType)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_HIPSPARSE64_CSR2HYB(ValueType, HipsparseName) \ + inline void csr2hyb( \ + hipsparseHandle_t handle, int64 m, int64 n, \ + const hipsparseMatDescr_t descrA, const ValueType *csrValA, \ + const int64 *csrRowPtrA, const int64 *csrColIndA, \ + hipsparseHybMat_t hybA, int64 userEllWidth, \ + hipsparseHybPartition_t 
partitionType) GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPSPARSE32_CSR2HYB(float, hipsparseScsr2hyb); +GKO_BIND_HIPSPARSE32_CSR2HYB(double, hipsparseDcsr2hyb); +GKO_BIND_HIPSPARSE64_CSR2HYB(float, hipsparseScsr2hyb); +GKO_BIND_HIPSPARSE64_CSR2HYB(double, hipsparseDcsr2hyb); +template +GKO_BIND_HIPSPARSE32_CSR2HYB(ValueType, detail::not_implemented); +template +GKO_BIND_HIPSPARSE64_CSR2HYB(ValueType, detail::not_implemented); + + +#undef GKO_BIND_HIPSPARSE32_CSR2HYB +#undef GKO_BIND_HIPSPARSE64_CSR2HYB + + +#define GKO_BIND_HIPSPARSE_TRANSPOSE32(ValueType, HipsparseName) \ + inline void transpose(hipsparseHandle_t handle, size_type m, size_type n, \ + size_type nnz, const ValueType *OrigValA, \ + const int32 *OrigRowPtrA, const int32 *OrigColIndA, \ + ValueType *TransValA, int32 *TransRowPtrA, \ + int32 *TransColIndA, hipsparseAction_t copyValues, \ + hipsparseIndexBase_t idxBase) \ + { \ + GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName( \ + handle, m, n, nnz, as_hiplibs_type(OrigValA), OrigRowPtrA, \ + OrigColIndA, as_hiplibs_type(TransValA), TransRowPtrA, \ + TransColIndA, copyValues, idxBase)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_HIPSPARSE_TRANSPOSE64(ValueType, HipsparseName) \ + inline void transpose(hipsparseHandle_t handle, size_type m, size_type n, \ + size_type nnz, const ValueType *OrigValA, \ + const int64 *OrigRowPtrA, const int64 *OrigColIndA, \ + ValueType *TransValA, int64 *TransRowPtrA, \ + int64 *TransColIndA, hipsparseAction_t copyValues, \ + hipsparseIndexBase_t idxBase) GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPSPARSE_TRANSPOSE32(float, hipsparseScsr2csc); +GKO_BIND_HIPSPARSE_TRANSPOSE32(double, hipsparseDcsr2csc); 
+GKO_BIND_HIPSPARSE_TRANSPOSE64(float, hipsparseScsr2csc); +GKO_BIND_HIPSPARSE_TRANSPOSE64(double, hipsparseDcsr2csc); +template +GKO_BIND_HIPSPARSE_TRANSPOSE32(ValueType, detail::not_implemented); +template +GKO_BIND_HIPSPARSE_TRANSPOSE64(ValueType, detail::not_implemented); + +#undef GKO_BIND_HIPSPARSE_TRANSPOSE + +#define GKO_BIND_HIPSPARSE_CONJ_TRANSPOSE32(ValueType, HipsparseName) \ + inline void conj_transpose( \ + hipsparseHandle_t handle, size_type m, size_type n, size_type nnz, \ + const ValueType *OrigValA, const int32 *OrigRowPtrA, \ + const int32 *OrigColIndA, ValueType *TransValA, int32 *TransRowPtrA, \ + int32 *TransColIndA, hipsparseAction_t copyValues, \ + hipsparseIndexBase_t idxBase) GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_HIPSPARSE_CONJ_TRANSPOSE64(ValueType, HipsparseName) \ + inline void conj_transpose( \ + hipsparseHandle_t handle, size_type m, size_type n, size_type nnz, \ + const ValueType *OrigValA, const int64 *OrigRowPtrA, \ + const int64 *OrigColIndA, ValueType *TransValA, int64 *TransRowPtrA, \ + int64 *TransColIndA, hipsparseAction_t copyValues, \ + hipsparseIndexBase_t idxBase) GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPSPARSE_CONJ_TRANSPOSE32(float, hipsparseScsr2csc); +GKO_BIND_HIPSPARSE_CONJ_TRANSPOSE32(double, hipsparseDcsr2csc); +GKO_BIND_HIPSPARSE_CONJ_TRANSPOSE64(float, hipsparseScsr2csc); +GKO_BIND_HIPSPARSE_CONJ_TRANSPOSE64(double, hipsparseDcsr2csc); +template +GKO_BIND_HIPSPARSE_CONJ_TRANSPOSE32(ValueType, detail::not_implemented); +template +GKO_BIND_HIPSPARSE_CONJ_TRANSPOSE64(ValueType, detail::not_implemented); + +#undef GKO_BIND_HIPSPARSE_CONJ_TRANSPOSE + + +#define GKO_BIND_HIPSPARSE32_CSRSV2_BUFFERSIZE(ValueType, HipsparseName) \ + inline void csrsv2_buffer_size( \ + hipsparseHandle_t handle, 
hipsparseOperation_t trans, \ + const size_type m, size_type nnz, const hipsparseMatDescr_t descr, \ + const ValueType *csrVal, const int32 *csrRowPtr, \ + const int32 *csrColInd, csrsv2Info_t factor_info, \ + int *factor_work_size) \ + { \ + GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName( \ + handle, trans, m, nnz, descr, \ + as_hiplibs_type(const_cast(csrVal)), csrRowPtr, \ + csrColInd, factor_info, factor_work_size)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_HIPSPARSE64_CSRSV2_BUFFERSIZE(ValueType, HipsparseName) \ + inline void csrsv2_buffer_size( \ + hipsparseHandle_t handle, hipsparseOperation_t trans, size_type m, \ + size_type nnz, const hipsparseMatDescr_t descr, \ + const ValueType *csrVal, const int64 *csrRowPtr, \ + const int64 *csrColInd, csrsv2Info_t factor_info, \ + int *factor_work_size) GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used to counter the " \ + "false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPSPARSE32_CSRSV2_BUFFERSIZE(float, hipsparseScsrsv2_bufferSize); +GKO_BIND_HIPSPARSE32_CSRSV2_BUFFERSIZE(double, hipsparseDcsrsv2_bufferSize); +GKO_BIND_HIPSPARSE64_CSRSV2_BUFFERSIZE(float, hipsparseScsrsv2_bufferSize); +GKO_BIND_HIPSPARSE64_CSRSV2_BUFFERSIZE(double, hipsparseDcsrsv2_bufferSize); +template +GKO_BIND_HIPSPARSE32_CSRSV2_BUFFERSIZE(ValueType, detail::not_implemented); +template +GKO_BIND_HIPSPARSE64_CSRSV2_BUFFERSIZE(ValueType, detail::not_implemented); +#undef GKO_BIND_HIPSPARSE32_CSRSV2_BUFFERSIZE +#undef GKO_BIND_HIPSPARSE64_CSRSV2_BUFFERSIZE + +#define GKO_BIND_HIPSPARSE32_CSRSV2_ANALYSIS(ValueType, HipsparseName) \ + inline void csrsv2_analysis( \ + hipsparseHandle_t handle, hipsparseOperation_t trans, size_type m, \ + size_type nnz, const hipsparseMatDescr_t descr, \ + const ValueType *csrVal, const int32 *csrRowPtr, \ + const int32 *csrColInd, csrsv2Info_t factor_info, \ + hipsparseSolvePolicy_t 
policy, void *factor_work_vec) \ + { \ + GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName( \ + handle, trans, m, nnz, descr, as_hiplibs_type(csrVal), csrRowPtr, \ + csrColInd, factor_info, policy, factor_work_vec)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_HIPSPARSE64_CSRSV2_ANALYSIS(ValueType, HipsparseName) \ + inline void csrsv2_analysis( \ + hipsparseHandle_t handle, hipsparseOperation_t trans, size_type m, \ + size_type nnz, const hipsparseMatDescr_t descr, \ + const ValueType *csrVal, const int64 *csrRowPtr, \ + const int64 *csrColInd, csrsv2Info_t factor_info, \ + hipsparseSolvePolicy_t policy, void *factor_work_vec) \ + GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used to counter the " \ + "false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPSPARSE32_CSRSV2_ANALYSIS(float, hipsparseScsrsv2_analysis); +GKO_BIND_HIPSPARSE32_CSRSV2_ANALYSIS(double, hipsparseDcsrsv2_analysis); +GKO_BIND_HIPSPARSE64_CSRSV2_ANALYSIS(float, hipsparseScsrsv2_analysis); +GKO_BIND_HIPSPARSE64_CSRSV2_ANALYSIS(double, hipsparseDcsrsv2_analysis); +template +GKO_BIND_HIPSPARSE32_CSRSV2_ANALYSIS(ValueType, detail::not_implemented); +template +GKO_BIND_HIPSPARSE64_CSRSV2_ANALYSIS(ValueType, detail::not_implemented); +#undef GKO_BIND_HIPSPARSE32_CSRSV2_ANALYSIS +#undef GKO_BIND_HIPSPARSE64_CSRSV2_ANALYSIS + +#define GKO_BIND_HIPSPARSE32_CSRSV2_SOLVE(ValueType, HipsparseName) \ + inline void csrsv2_solve( \ + hipsparseHandle_t handle, hipsparseOperation_t trans, size_type m, \ + size_type nnz, const ValueType *one, const hipsparseMatDescr_t descr, \ + const ValueType *csrVal, const int32 *csrRowPtr, \ + const int32 *csrColInd, csrsv2Info_t factor_info, \ + const ValueType *rhs, ValueType *sol, hipsparseSolvePolicy_t policy, \ + void *factor_work_vec) \ + { \ + GKO_ASSERT_NO_HIPSPARSE_ERRORS( \ + HipsparseName(handle, trans, m, nnz, as_hiplibs_type(one), descr, \ + 
as_hiplibs_type(csrVal), csrRowPtr, csrColInd, \ + factor_info, as_hiplibs_type(rhs), \ + as_hiplibs_type(sol), policy, factor_work_vec)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_HIPSPARSE64_CSRSV2_SOLVE(ValueType, HipsparseName) \ + inline void csrsv2_solve( \ + hipsparseHandle_t handle, hipsparseOperation_t trans, size_type m, \ + size_type nnz, const ValueType *one, const hipsparseMatDescr_t descr, \ + const ValueType *csrVal, const int64 *csrRowPtr, \ + const int64 *csrColInd, csrsv2Info_t factor_info, \ + const ValueType *rhs, ValueType *sol, hipsparseSolvePolicy_t policy, \ + void *factor_work_vec) GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPSPARSE32_CSRSV2_SOLVE(float, hipsparseScsrsv2_solve); +GKO_BIND_HIPSPARSE32_CSRSV2_SOLVE(double, hipsparseDcsrsv2_solve); +GKO_BIND_HIPSPARSE64_CSRSV2_SOLVE(float, hipsparseScsrsv2_solve); +GKO_BIND_HIPSPARSE64_CSRSV2_SOLVE(double, hipsparseDcsrsv2_solve); +template +GKO_BIND_HIPSPARSE32_CSRSV2_SOLVE(ValueType, detail::not_implemented); +template +GKO_BIND_HIPSPARSE64_CSRSV2_SOLVE(ValueType, detail::not_implemented); +#undef GKO_BIND_HIPSPARSE32_CSRSV2_SOLVE +#undef GKO_BIND_HIPSPARSE64_CSRSV2_SOLVE + + +inline hipsparseContext *init() +{ + hipsparseHandle_t handle{}; + GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreate(&handle)); + GKO_ASSERT_NO_HIPSPARSE_ERRORS( + hipsparseSetPointerMode(handle, HIPSPARSE_POINTER_MODE_DEVICE)); + return reinterpret_cast(handle); +} + + +inline void destroy_hipsparse_handle(hipsparseContext *handle) +{ + GKO_ASSERT_NO_HIPSPARSE_ERRORS( + hipsparseDestroy(reinterpret_cast(handle))); +} + + +inline hipsparseMatDescr_t create_mat_descr() +{ + hipsparseMatDescr_t descr{}; + GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreateMatDescr(&descr)); + return descr; +} + + +inline void 
destroy(hipsparseMatDescr_t descr) +{ + GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseDestroyMatDescr(descr)); +} + + +inline csrgemm2Info_t create_spgemm_info() +{ + csrgemm2Info_t info{}; + GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreateCsrgemm2Info(&info)); + return info; +} + + +inline void destroy_spgemm_info(csrgemm2Info_t info) +{ + GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseDestroyCsrgemm2Info(info)); +} + + +inline csrilu02Info_t create_ilu0_info() +{ + csrilu02Info_t info{}; + GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreateCsrilu02Info(&info)); + return info; +} + + +inline void destroy_ilu0_info(csrilu02Info_t info) +{ + GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseDestroyCsrilu02Info(info)); +} + + +template +void create_identity_permutation(hipsparseHandle_t handle, IndexType size, + IndexType *permutation) GKO_NOT_IMPLEMENTED; + +template <> +inline void create_identity_permutation(hipsparseHandle_t handle, + int32 size, int32 *permutation) +{ + GKO_ASSERT_NO_HIPSPARSE_ERRORS( + hipsparseCreateIdentityPermutation(handle, size, permutation)); +} + + +template +void csrsort_buffer_size(hipsparseHandle_t handle, IndexType m, IndexType n, + IndexType nnz, const IndexType *row_ptrs, + const IndexType *col_idxs, + size_type &buffer_size) GKO_NOT_IMPLEMENTED; + +template <> +inline void csrsort_buffer_size(hipsparseHandle_t handle, int32 m, + int32 n, int32 nnz, + const int32 *row_ptrs, + const int32 *col_idxs, + size_type &buffer_size) +{ + GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseXcsrsort_bufferSizeExt( + handle, m, n, nnz, row_ptrs, col_idxs, &buffer_size)); +} + + +template +void csrsort(hipsparseHandle_t handle, IndexType m, IndexType n, IndexType nnz, + const hipsparseMatDescr_t descr, const IndexType *row_ptrs, + IndexType *col_idxs, IndexType *permutation, + void *buffer) GKO_NOT_IMPLEMENTED; + +template <> +inline void csrsort(hipsparseHandle_t handle, int32 m, int32 n, + int32 nnz, const hipsparseMatDescr_t descr, + const int32 *row_ptrs, int32 *col_idxs, + int32 
*permutation, void *buffer) +{ + GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseXcsrsort( + handle, m, n, nnz, descr, row_ptrs, col_idxs, permutation, buffer)); +} + + +template +void gather(hipsparseHandle_t handle, IndexType nnz, const ValueType *in, + ValueType *out, const IndexType *permutation) GKO_NOT_IMPLEMENTED; + +#define GKO_BIND_HIPSPARSE_GATHER(ValueType, HipsparseName) \ + template <> \ + inline void gather(hipsparseHandle_t handle, int32 nnz, \ + const ValueType *in, ValueType *out, \ + const int32 *permutation) \ + { \ + GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName( \ + handle, nnz, as_hiplibs_type(in), as_hiplibs_type(out), \ + permutation, HIPSPARSE_INDEX_BASE_ZERO)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPSPARSE_GATHER(float, hipsparseSgthr); +GKO_BIND_HIPSPARSE_GATHER(double, hipsparseDgthr); +#if defined(hipsparseVersionMajor) && defined(hipsparseVersionMinor) && \ + ((hipsparseVersionMajor > 1) || \ + (hipsparseVersionMajor == 1 && hipsparseVersionMinor >= 4)) +GKO_BIND_HIPSPARSE_GATHER(std::complex, hipsparseCgthr); +GKO_BIND_HIPSPARSE_GATHER(std::complex, hipsparseZgthr); +#endif // hipsparse version >= 1.4 + +#undef GKO_BIND_HIPSPARSE_GATHER + + +template +void ilu0_buffer_size(hipsparseHandle_t handle, IndexType m, IndexType nnz, + const hipsparseMatDescr_t descr, const ValueType *vals, + const IndexType *row_ptrs, const IndexType *col_idxs, + csrilu02Info_t info, + size_type &buffer_size) GKO_NOT_IMPLEMENTED; + +#define GKO_BIND_HIPSPARSE_ILU0_BUFFER_SIZE(ValueType, HipsparseName) \ + template <> \ + inline void ilu0_buffer_size( \ + hipsparseHandle_t handle, int32 m, int32 nnz, \ + const hipsparseMatDescr_t descr, const ValueType *vals, \ + const int32 *row_ptrs, const int32 *col_idxs, csrilu02Info_t info, \ + size_type &buffer_size) \ + { \ + int tmp_buffer_size{}; \ + GKO_ASSERT_NO_HIPSPARSE_ERRORS( \ + HipsparseName(handle, m, nnz, descr, \ + 
as_hiplibs_type(const_cast(vals)), \ + row_ptrs, col_idxs, info, &tmp_buffer_size)); \ + buffer_size = tmp_buffer_size; \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPSPARSE_ILU0_BUFFER_SIZE(float, hipsparseScsrilu02_bufferSize); +GKO_BIND_HIPSPARSE_ILU0_BUFFER_SIZE(double, hipsparseDcsrilu02_bufferSize); +#if defined(hipsparseVersionMajor) && defined(hipsparseVersionMinor) && \ + ((hipsparseVersionMajor > 1) || \ + (hipsparseVersionMajor == 1 && hipsparseVersionMinor >= 4)) +GKO_BIND_HIPSPARSE_ILU0_BUFFER_SIZE(std::complex, + hipsparseCcsrilu02_bufferSize); +GKO_BIND_HIPSPARSE_ILU0_BUFFER_SIZE(std::complex, + hipsparseZcsrilu02_bufferSize); +#endif // hipsparse version >= 1.4 + +#undef GKO_BIND_HIPSPARSE_ILU0_BUFFER_SIZE + + +template +void ilu0_analysis(hipsparseHandle_t handle, IndexType m, IndexType nnz, + const hipsparseMatDescr_t descr, const ValueType *vals, + const IndexType *row_ptrs, const IndexType *col_idxs, + csrilu02Info_t info, hipsparseSolvePolicy_t policy, + void *buffer) GKO_NOT_IMPLEMENTED; + +#define GKO_BIND_HIPSPARSE_ILU0_ANALYSIS(ValueType, HipsparseName) \ + template <> \ + inline void ilu0_analysis( \ + hipsparseHandle_t handle, int32 m, int32 nnz, \ + const hipsparseMatDescr_t descr, const ValueType *vals, \ + const int32 *row_ptrs, const int32 *col_idxs, csrilu02Info_t info, \ + hipsparseSolvePolicy_t policy, void *buffer) \ + { \ + GKO_ASSERT_NO_HIPSPARSE_ERRORS( \ + HipsparseName(handle, m, nnz, descr, as_hiplibs_type(vals), \ + row_ptrs, col_idxs, info, policy, buffer)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPSPARSE_ILU0_ANALYSIS(float, hipsparseScsrilu02_analysis); +GKO_BIND_HIPSPARSE_ILU0_ANALYSIS(double, hipsparseDcsrilu02_analysis); +#if defined(hipsparseVersionMajor) && defined(hipsparseVersionMinor) && \ + ((hipsparseVersionMajor > 1) || \ + 
(hipsparseVersionMajor == 1 && hipsparseVersionMinor >= 4)) +GKO_BIND_HIPSPARSE_ILU0_ANALYSIS(std::complex, + hipsparseCcsrilu02_analysis); +GKO_BIND_HIPSPARSE_ILU0_ANALYSIS(std::complex, + hipsparseZcsrilu02_analysis); +#endif // hipsparse version >= 1.4 + +#undef GKO_BIND_HIPSPARSE_ILU0_ANALYSIS + + +template +void ilu0(hipsparseHandle_t handle, IndexType m, IndexType nnz, + const hipsparseMatDescr_t descr, ValueType *vals, + const IndexType *row_ptrs, const IndexType *col_idxs, + csrilu02Info_t info, hipsparseSolvePolicy_t policy, + void *buffer) GKO_NOT_IMPLEMENTED; + +#define GKO_BIND_HIPSPARSE_ILU0(ValueType, HipsparseName) \ + template <> \ + inline void ilu0( \ + hipsparseHandle_t handle, int32 m, int32 nnz, \ + const hipsparseMatDescr_t descr, ValueType *vals, \ + const int32 *row_ptrs, const int32 *col_idxs, csrilu02Info_t info, \ + hipsparseSolvePolicy_t policy, void *buffer) \ + { \ + GKO_ASSERT_NO_HIPSPARSE_ERRORS( \ + HipsparseName(handle, m, nnz, descr, as_hiplibs_type(vals), \ + row_ptrs, col_idxs, info, policy, buffer)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPSPARSE_ILU0(float, hipsparseScsrilu02); +GKO_BIND_HIPSPARSE_ILU0(double, hipsparseDcsrilu02); +#if defined(hipsparseVersionMajor) && defined(hipsparseVersionMinor) && \ + ((hipsparseVersionMajor > 1) || \ + (hipsparseVersionMajor == 1 && hipsparseVersionMinor >= 4)) +GKO_BIND_HIPSPARSE_ILU0(std::complex, hipsparseCcsrilu02); +GKO_BIND_HIPSPARSE_ILU0(std::complex, hipsparseZcsrilu02); +#endif // hipsparse version >= 1.4 + +#undef GKO_BIND_HIPSPARSE_ILU0 + + +} // namespace hipsparse +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_BASE_HIPSPARSE_BINDINGS_HIP_HPP_ diff --git a/hip/base/math.hip.hpp b/hip/base/math.hip.hpp new file mode 100644 index 00000000000..a80cc24f989 --- /dev/null +++ b/hip/base/math.hip.hpp @@ -0,0 +1,52 @@ 
+/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_HIP_BASE_MATH_HIP_HPP_ +#define GKO_HIP_BASE_MATH_HIP_HPP_ + + +#include + + +#include + + +namespace gko { + + +#include "common/base/math.hpp.inc" + + +} // namespace gko + + +#endif // GKO_HIP_BASE_MATH_HIP_HPP_ diff --git a/hip/base/pointer_mode_guard.hip.hpp b/hip/base/pointer_mode_guard.hip.hpp new file mode 100644 index 00000000000..f5601c5003a --- /dev/null +++ b/hip/base/pointer_mode_guard.hip.hpp @@ -0,0 +1,156 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_HIP_BASE_POINTER_MODE_GUARD_HIP_HPP_ +#define GKO_HIP_BASE_POINTER_MODE_GUARD_HIP_HPP_ + + +#include + + +#include +#include +#include + + +#include +#include + + +namespace gko { +namespace kernels { +namespace hip { +namespace hipblas { + + +/** + * This class defines a pointer mode guard for the hip functions and the hip + * module. The guard is used to make sure that the correct pointer mode has been + * set when using scalars for the hipblas functions. The class records the + * current handle and sets the pointer mode to host for the current scope. After + * the scope has been exited, the destructor sets the pointer mode back to + * device. 
+ */ +class pointer_mode_guard { +public: + pointer_mode_guard(hipblasContext *handle) + { + l_handle = handle; + GKO_ASSERT_NO_HIPBLAS_ERRORS( + hipblasSetPointerMode(reinterpret_cast(handle), + HIPBLAS_POINTER_MODE_HOST)); + } + + pointer_mode_guard(pointer_mode_guard &other) = delete; + + pointer_mode_guard &operator=(const pointer_mode_guard &other) = delete; + + pointer_mode_guard(pointer_mode_guard &&other) = delete; + + pointer_mode_guard const &operator=(pointer_mode_guard &&other) = delete; + + ~pointer_mode_guard() noexcept(false) + { + /* Ignore the error during stack unwinding for this call */ + if (std::uncaught_exception()) { + hipblasSetPointerMode(reinterpret_cast(l_handle), + HIPBLAS_POINTER_MODE_DEVICE); + } else { + GKO_ASSERT_NO_HIPBLAS_ERRORS(hipblasSetPointerMode( + reinterpret_cast(l_handle), + HIPBLAS_POINTER_MODE_DEVICE)); + } + } + +private: + hipblasContext *l_handle; +}; + + +} // namespace hipblas + + +namespace hipsparse { + + +/** + * This class defines a pointer mode guard for the hip functions and the hip + * module. The guard is used to make sure that the correct pointer mode has been + * set when using scalars for the hipsparse functions. The class records the + * current handle and sets the pointer mode to host for the current scope. After + * the scope has been exited, the destructor sets the pointer mode back to + * device. 
+ */ +class pointer_mode_guard { +public: + pointer_mode_guard(hipsparseContext *handle) + { + l_handle = handle; + GKO_ASSERT_NO_HIPSPARSE_ERRORS( + hipsparseSetPointerMode(reinterpret_cast(handle), + HIPSPARSE_POINTER_MODE_HOST)); + } + + pointer_mode_guard(pointer_mode_guard &other) = delete; + + pointer_mode_guard &operator=(const pointer_mode_guard &other) = delete; + + pointer_mode_guard(pointer_mode_guard &&other) = delete; + + pointer_mode_guard const &operator=(pointer_mode_guard &&other) = delete; + + ~pointer_mode_guard() noexcept(false) + { + /* Ignore the error during stack unwinding for this call */ + if (std::uncaught_exception()) { + hipsparseSetPointerMode( + reinterpret_cast(l_handle), + HIPSPARSE_POINTER_MODE_DEVICE); + } else { + GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseSetPointerMode( + reinterpret_cast(l_handle), + HIPSPARSE_POINTER_MODE_DEVICE)); + } + } + +private: + hipsparseContext *l_handle; +}; + + +} // namespace hipsparse +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_BASE_POINTER_MODE_GUARD_HIP_HPP_ diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp new file mode 100644 index 00000000000..11ed7c9d847 --- /dev/null +++ b/hip/base/types.hip.hpp @@ -0,0 +1,260 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_HIP_BASE_TYPES_HIP_HPP_ +#define GKO_HIP_BASE_TYPES_HIP_HPP_ + + +#include + + +#include +#include +#include +#include +#include + + +#include + + +namespace gko { + + +namespace kernels { +namespace hip { +namespace detail { + + +template +struct hiplibs_type_impl { + using type = T; +}; + +template +struct hiplibs_type_impl { + using type = typename hiplibs_type_impl::type *; +}; + +template +struct hiplibs_type_impl { + using type = typename hiplibs_type_impl::type &; +}; + +template +struct hiplibs_type_impl { + using type = const typename hiplibs_type_impl::type; +}; + +template +struct hiplibs_type_impl { + using type = volatile typename hiplibs_type_impl::type; +}; + +template <> +struct hiplibs_type_impl> { + using type = hipComplex; +}; + +template <> +struct hiplibs_type_impl> { + using type = hipDoubleComplex; +}; + +template +struct hiplibs_type_impl> { + using type = typename hiplibs_type_impl>::type; +}; + +template +struct 
hip_type_impl { + using type = T; +}; + +template +struct hip_type_impl { + using type = typename hip_type_impl::type *; +}; + +template +struct hip_type_impl { + using type = typename hip_type_impl::type &; +}; + +template +struct hip_type_impl { + using type = const typename hip_type_impl::type; +}; + +template +struct hip_type_impl { + using type = volatile typename hip_type_impl::type; +}; + +template +struct hip_type_impl> { + using type = thrust::complex; +}; + +template <> +struct hip_type_impl { + using type = thrust::complex; +}; + +template <> +struct hip_type_impl { + using type = thrust::complex; +}; + +template +constexpr hipblasDatatype_t hip_data_type_impl() +{ + return HIPBLAS_C_16F; +} + +template <> +constexpr hipblasDatatype_t hip_data_type_impl() +{ + return HIPBLAS_R_16F; +} + +template <> +constexpr hipblasDatatype_t hip_data_type_impl() +{ + return HIPBLAS_R_32F; +} + +template <> +constexpr hipblasDatatype_t hip_data_type_impl() +{ + return HIPBLAS_R_64F; +} + +template <> +constexpr hipblasDatatype_t hip_data_type_impl>() +{ + return HIPBLAS_C_32F; +} + +template <> +constexpr hipblasDatatype_t hip_data_type_impl>() +{ + return HIPBLAS_C_64F; +} + + +} // namespace detail + + +/** + * This is an alias for the `hipblasDatatype_t` equivalent of `T`. By default, + * i.e. for types without a specialization, HIPBLAS_C_16F is returned. + * + * @tparam T a type + * + * @returns the actual `hipblasDatatype_t` + */ +template +constexpr hipblasDatatype_t hip_data_type() +{ + return detail::hip_data_type_impl(); +} + + +/** + * This is an alias for HIP's equivalent of `T`. + * + * @tparam T a type + */ +template +using hip_type = typename detail::hip_type_impl::type; + + +/** + * Reinterprets the passed in value as a HIP type.
+ * + * @param val the value to reinterpret + * + * @return `val` reinterpreted to HIP type + */ +template +inline xstd::enable_if_t< + std::is_pointer::value || std::is_reference::value, hip_type> +as_hip_type(T val) +{ + return reinterpret_cast>(val); +} + + +/** + * @copydoc as_hip_type() + */ +template +inline xstd::enable_if_t< + !std::is_pointer::value && !std::is_reference::value, hip_type> +as_hip_type(T val) +{ + return *reinterpret_cast *>(&val); +} + + +/** + * This is an alias for equivalent of type T used in HIP libraries (HIPBLAS, + * HIPSPARSE, etc.). + * + * @tparam T a type + */ +template +using hiplibs_type = typename detail::hiplibs_type_impl::type; + + +/** + * Reinterprets the passed in value as an equivalent type used by the HIP + * libraries. + * + * @param val the value to reinterpret + * + * @return `val` reinterpreted to type used by HIP libraries + */ +template +inline hiplibs_type as_hiplibs_type(T val) +{ + return reinterpret_cast>(val); +} + + +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_BASE_TYPES_HIP_HPP_ diff --git a/hip/base/version.hip.cpp b/hip/base/version.hip.cpp new file mode 100644 index 00000000000..5c5473cbd55 --- /dev/null +++ b/hip/base/version.hip.cpp @@ -0,0 +1,48 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +namespace gko { + + +version version_info::get_hip_version() noexcept +{ + // When compiling the module, the header version is the same as the library + // version. Mismatch between the header and the module versions may happen + // if using shared libraries from different versions of Ginkgo. + return version_info::get_header_version(); +} + + +} // namespace gko diff --git a/hip/components/atomic.hip.hpp b/hip/components/atomic.hip.hpp new file mode 100644 index 00000000000..c5ef42dba80 --- /dev/null +++ b/hip/components/atomic.hip.hpp @@ -0,0 +1,86 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_HIP_COMPONENTS_ATOMIC_HIP_HPP_ +#define GKO_HIP_COMPONENTS_ATOMIC_HIP_HPP_ + + +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { + + +#include "common/components/atomic.hpp.inc" + + +/** + * @internal + * + * @note It is not 'real' complex atomic add operation + */ +__forceinline__ __device__ thrust::complex atomic_add( + thrust::complex *__restrict__ address, thrust::complex val) +{ + hipComplex *addr = reinterpret_cast(address); + // Separate to real part and imag part + auto real = atomic_add(static_cast(&(addr->x)), val.real()); + auto imag = atomic_add(static_cast(&(addr->y)), val.imag()); + return {real, imag}; +} + + +/** + * @internal + * + * @note It is not 'real' complex atomic add operation + */ +__forceinline__ __device__ thrust::complex atomic_add( + thrust::complex *__restrict__ address, thrust::complex val) +{ + hipDoubleComplex *addr = reinterpret_cast(address); + // Separate to real part and imag part + auto real = atomic_add(static_cast(&(addr->x)), val.real()); + auto imag = atomic_add(static_cast(&(addr->y)), val.imag()); + return {real, imag}; +} + + +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_COMPONENTS_ATOMIC_HIP_HPP_ diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp new file mode 100644 index 00000000000..a893479ec98 --- /dev/null +++ b/hip/components/cooperative_groups.hip.hpp @@ -0,0 +1,511 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_HIP_COMPONENTS_COOPERATIVE_GROUPS_HIP_HPP_ +#define GKO_HIP_COMPONENTS_COOPERATIVE_GROUPS_HIP_HPP_ + + +#include + + +#include "hip/base/config.hip.hpp" +#include "hip/base/types.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { + + +/** + * Ginkgo uses cooperative groups to handle communication among the threads. + * + * However, HIP's implementation of cooperative groups is still quite limited + * in functionality, and some parts are not supported on all hardware + * interesting for Ginkgo. 
For this reason, Ginkgo exposes only a part of the + * original functionality, and possibly extends it if it is required. Thus, + * developers should include and use this header and the + * gko::kernels::hip::group namespace instead of the standard + * cooperative_groups.h header. The interface exposed + * by Ginkgo's implementation is equivalent to the standard interface, with some + * useful extensions. + * + * A cooperative group (both from standard HIP and from Ginkgo) is not a + * specific type, but a concept. That is, any type satisfying the interface + * imposed by the cooperative groups API is considered a cooperative + * group (a.k.a. "duck typing"). To maximize the generality of components that + * need cooperative groups, instead of creating the group manually, consider + * requesting one as an input parameter. Make sure its type is a template + * parameter to maximize the set of groups for which your algorithm can be + * invoked. To maximize the number of contexts in which your algorithm can be + * called and avoid hidden requirements, do not depend on a specific setup of + * kernel launch parameters (i.e. grid dimensions and block dimensions). + * Instead, use the thread_rank() method of the group to distinguish between + * distinct threads of a group. + * + * The original HIP implementation does not provide ways to verify if a certain + * type represents a cooperative group. Ginkgo's implementation provides + * metafunctions which do that. Additionally, not all cooperative groups have + * equivalent functionality, so Ginkgo splits the cooperative group concept into + * three sub-concepts which describe what functionality is available.
Here is a + * list of concepts and their interfaces: + * + * ```c++ + * concept Group { + * unsigned size() const; + * unsigned thread_rank() const; + * }; + * + * concept SynchronizableGroup : Group { + * void sync(); + * }; + * + * concept CommunicatorGroup : SynchronizableGroup { + * template + * T shfl(T var, int srcLane); + * T shfl_up(T var, unsigned delta); + * T shfl_down(T var, unsigned delta); + * T shfl_xor(T var, int laneMask); + * int all(int predicate); + * int any(int predicate); + * unsigned ballot(int predicate); + * }; + * ``` + * + * To check if a group T satisfies one of the concepts, one can use the + * metafunctions is_group::value, is_synchronizable_group::value and + * is_communicator_group::value. + * + * @note Please note that the current implementation of cooperative groups + * contains only a subset of functionalities provided by those APIs. If + * you need more functionality, please add the appropriate implementations + * to existing cooperative groups, or create new groups if the existing + * groups do not cover your use-case. For an example, see the + * enable_extended_shuffle mixin, which adds extended shuffles support + * to built-in HIP cooperative groups. + */ +namespace group { + + +// metafunctions + + +namespace detail { + + +/* Primary templates: by default a type satisfies none of the group concepts; + * a group type opts in through an explicit specialization. */ +template +struct is_group_impl : std::false_type {}; + + +template +struct is_synchronizable_group_impl : std::false_type {}; + + +template +struct is_communicator_group_impl : std::false_type {}; + +} // namespace detail + + +/** + * Check if T is a Group. + */ +template +using is_group = detail::is_group_impl>; + + +/** + * Check if T is a SynchronizableGroup. + */ +template +using is_synchronizable_group = + detail::is_synchronizable_group_impl>; + + +/** + * Check if T is a CommunicatorGroup. + */ +template +using is_communicator_group = + detail::is_communicator_group_impl>; + + +// types + + +namespace detail { + + +/** + * This is a limited implementation of the HIP thread_block_tile.
+ * `any` and `all` are only supported when the size is config::warp_size + * + */ +template +class thread_block_tile { + /** + * Mask with Size consecutive ones starting at the least significant bit. + */ + static constexpr auto lane_mask_base = ~config::lane_mask_type{} >> + (config::warp_size - Size); + +public: + __device__ thread_block_tile() : data_{Size, 0, 0, lane_mask_base} + { + auto tid = + unsigned(threadIdx.x + + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z)); + data_.rank = tid % Size; + data_.lane_offset = (tid % config::warp_size) / Size * Size; + data_.mask <<= data_.lane_offset; + } + + __device__ __forceinline__ unsigned thread_rank() const noexcept + { + return data_.rank; + } + + __device__ __forceinline__ unsigned size() const noexcept { return Size; } + + __device__ __forceinline__ void sync() const noexcept + { +#if GINKGO_HIP_PLATFORM_NVCC + __syncwarp(data_.mask); +#endif + } + +#if GINKGO_HIP_PLATFORM_HCC +#define GKO_BIND_SHFL(ShflOp, ValueType, SelectorType) \ + __device__ __forceinline__ ValueType ShflOp( \ + ValueType var, SelectorType selector) const noexcept \ + { \ + return __##ShflOp(var, selector, Size); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") +#else +#define GKO_BIND_SHFL(ShflOp, ValueType, SelectorType) \ + __device__ __forceinline__ ValueType ShflOp( \ + ValueType var, SelectorType selector) const noexcept \ + { \ + return __##ShflOp##_sync(data_.mask, var, selector, Size); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") +#endif + + GKO_BIND_SHFL(shfl, int32, int32); + GKO_BIND_SHFL(shfl, float, int32); + GKO_BIND_SHFL(shfl, uint32, int32); + GKO_BIND_SHFL(shfl, double, int32); + + GKO_BIND_SHFL(shfl_up, int32, uint32); + GKO_BIND_SHFL(shfl_up, uint32, uint32); + GKO_BIND_SHFL(shfl_up, float, uint32); + GKO_BIND_SHFL(shfl_up, double, uint32); + + 
GKO_BIND_SHFL(shfl_down, int32, uint32); + GKO_BIND_SHFL(shfl_down, uint32, uint32); + GKO_BIND_SHFL(shfl_down, float, uint32); + GKO_BIND_SHFL(shfl_down, double, uint32); + + GKO_BIND_SHFL(shfl_xor, int32, int32); + GKO_BIND_SHFL(shfl_xor, float, int32); + GKO_BIND_SHFL(shfl_xor, uint32, int32); + GKO_BIND_SHFL(shfl_xor, double, int32); + + /** + * Returns true iff the predicate is true for at least one thread in the + * group. Note that the whole group needs to execute the same operation. + */ + __device__ __forceinline__ int any(int predicate) const noexcept + { +#if GINKGO_HIP_PLATFORM_HCC + if (Size == config::warp_size) { + return __any(predicate); + } else { + return (__ballot(predicate) & data_.mask) != 0; + } +#else + return __any_sync(data_.mask, predicate); +#endif + } + + /** + * Returns true iff the predicate is true for all threads in the group. + * Note that the whole group needs to execute the same operation. + */ + __device__ __forceinline__ int all(int predicate) const noexcept + { +#if GINKGO_HIP_PLATFORM_HCC + if (Size == config::warp_size) { + return __all(predicate); + } else { + return (__ballot(predicate) & data_.mask) == data_.mask; + } +#else + return __all_sync(data_.mask, predicate); +#endif + } + + /** + * Returns a bitmask containing the value of the given predicate + * for all threads in the group. + * This means that the ith bit is equal to the predicate of the + * thread with thread_rank() == i in the group. + * Note that the whole group needs to execute the same operation.
+ */ + __device__ __forceinline__ config::lane_mask_type ballot( + int predicate) const noexcept + { +#if GINKGO_HIP_PLATFORM_HCC + if (Size == config::warp_size) { + return __ballot(predicate); + } else { + return (__ballot(predicate) & data_.mask) >> data_.lane_offset; + } +#else + if (Size == config::warp_size) { + return __ballot_sync(data_.mask, predicate); + } else { + return __ballot_sync(data_.mask, predicate) >> data_.lane_offset; + } +#endif + } + +private: + struct alignas(8) { + unsigned size; + unsigned rank; + unsigned lane_offset; + config::lane_mask_type mask; + } data_; +}; + + +} // namespace detail + + +namespace detail { + + +// Adds generalized shuffles that support any type to the group. +template +class enable_extended_shuffle : public Group { +public: + using Group::Group; + using Group::shfl; + using Group::shfl_down; + using Group::shfl_up; + using Group::shfl_xor; + +#define GKO_ENABLE_SHUFFLE_OPERATION(_name, SelectorType) \ + template \ + __device__ __forceinline__ ValueType _name(const ValueType &var, \ + SelectorType selector) const \ + { \ + return shuffle_impl( \ + [this](uint32 v, SelectorType s) { \ + return static_cast(this)->_name(v, s); \ + }, \ + var, selector); \ + } + + GKO_ENABLE_SHUFFLE_OPERATION(shfl, int32) + GKO_ENABLE_SHUFFLE_OPERATION(shfl_up, uint32) + GKO_ENABLE_SHUFFLE_OPERATION(shfl_down, uint32) + GKO_ENABLE_SHUFFLE_OPERATION(shfl_xor, int32) + +#undef GKO_ENABLE_SHUFFLE_OPERATION + +private: + template + static __device__ __forceinline__ ValueType + shuffle_impl(ShuffleOperator intrinsic_shuffle, const ValueType var, + SelectorType selector) + { + static_assert(sizeof(ValueType) % sizeof(uint32) == 0, + "Unable to shuffle sizes which are not 4-byte multiples"); + constexpr auto value_size = sizeof(ValueType) / sizeof(uint32); + ValueType result; + auto var_array = reinterpret_cast(&var); + auto result_array = reinterpret_cast(&result); +#pragma unroll + for (std::size_t i = 0; i < value_size; ++i) { + 
result_array[i] = intrinsic_shuffle(var_array[i], selector); + } + return result; + } +}; + + +} // namespace detail + + +// Implementing this as a using directive messes up with SFINAE for some reason, +// probably a bug in NVCC. If it is a complete type, everything works fine. +template +struct thread_block_tile + : detail::enable_extended_shuffle> { + using detail::enable_extended_shuffle< + detail::thread_block_tile>::enable_extended_shuffle; +}; + + +// Only support tile_partition with 1, 2, 4, 8, 16, 32, 64 (hip). +template +__device__ __forceinline__ gko::xstd::enable_if_t< + (Size <= kernels::hip::config::warp_size) && (Size > 0) && + (kernels::hip::config::warp_size % Size == 0), + thread_block_tile> +tiled_partition(const Group &) +{ + return thread_block_tile(); +} + + +namespace detail { + + +template +struct is_group_impl> : std::true_type {}; +template +struct is_synchronizable_group_impl> : std::true_type { +}; +template +struct is_communicator_group_impl> : std::true_type {}; + + +} // namespace detail + + +class thread_block { + friend __device__ __forceinline__ thread_block this_thread_block(); + +public: + __device__ __forceinline__ unsigned thread_rank() const noexcept + { + return data_.rank; + } + + __device__ __forceinline__ unsigned size() const noexcept + { + return data_.size; + } + + __device__ __forceinline__ void sync() const noexcept { __syncthreads(); } + +private: + __device__ thread_block() + : data_{static_cast(blockDim.x * blockDim.y * blockDim.z), + static_cast( + threadIdx.x + + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z))} + {} + struct alignas(8) { + unsigned size; + unsigned rank; + } data_; +}; + + +__device__ __forceinline__ thread_block this_thread_block() +{ + return thread_block(); +} + + +namespace detail { + +template <> +struct is_group_impl : std::true_type {}; +template <> +struct is_synchronizable_group_impl : std::true_type {}; + + +} // namespace detail + + +/** + * This is a limited implementation of 
the CUDA grid_group that works even on + * devices that do not support device-wide synchronization and without special + * kernel launch syntax. + * + * Note that this implementation (as well as the one from CUDA's cooperative + * groups) does not support large grids, since it uses 32 bits to represent + * sizes and ranks, while at least 73 bits (63 bit grid + 10 bit block) would + * have to be used to represent the full space of thread ranks. + */ +class grid_group { + friend __device__ grid_group this_grid(); + +public: + __device__ unsigned size() const noexcept { return data_.size; } + + __device__ unsigned thread_rank() const noexcept { return data_.rank; } + +private: + // clang-format off + __device__ grid_group() + : data_{ + blockDim.x * blockDim.y * blockDim.z * + gridDim.x * gridDim.y * gridDim.z, + threadIdx.x + blockDim.x * + (threadIdx.y + blockDim.y * + (threadIdx.z + blockDim.z * + (blockIdx.x + gridDim.x * + (blockIdx.y + gridDim.y * blockIdx.z))))} + {} + // clang-format on + + struct alignas(8) { + unsigned size; + unsigned rank; + } data_; +}; + +// Not using this, as grid_group is not universally supported. +// grid_group this_grid() +// using cooperative_groups::this_grid; +// Instead, use our limited implementation: +__device__ inline grid_group this_grid() { return {}; } + + +} // namespace group +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_COMPONENTS_COOPERATIVE_GROUPS_HIP_HPP_ diff --git a/hip/components/diagonal_block_manipulation.hip.hpp b/hip/components/diagonal_block_manipulation.hip.hpp new file mode 100644 index 00000000000..729e5c3336c --- /dev/null +++ b/hip/components/diagonal_block_manipulation.hip.hpp @@ -0,0 +1,57 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HIP_HPP_ +#define GKO_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HIP_HPP_ + + +#include "hip/base/config.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +namespace csr { + + +#include "common/components/diagonal_block_manipulation.hpp.inc" + + +} // namespace csr +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HIP_HPP_ diff --git a/hip/components/fill_array.hip.cpp b/hip/components/fill_array.hip.cpp new file mode 100644 index 00000000000..e738a68811e --- /dev/null +++ b/hip/components/fill_array.hip.cpp @@ -0,0 +1,76 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/components/fill_array.hpp" + + +#include + + +#include + + +#include "hip/base/types.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +namespace components { + +constexpr int default_block_size = 512; + + +#include "common/components/fill_array.hpp.inc" + + +/** + * Launches `kernel::fill_array` with `n`, `array` and `val`, using blocks of + * `default_block_size` threads and enough blocks to provide at least `n` + * threads. Returns without launching anything when `n` is zero. + */ +template +void fill_array(std::shared_ptr exec, ValueType *array, + size_type n, ValueType val) +{ + /* An empty grid is not a valid launch configuration and there is nothing + * to fill, so skip the launch. */ + if (n == 0) { + return; + } + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size(ceildiv(n, block_size.x), 1, 1); + hipLaunchKernelGGL(kernel::fill_array, dim3(grid_size), dim3(block_size), 0, + 0, n, as_hip_type(array), as_hip_type(val)); +} + + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL); +template GKO_DECLARE_FILL_ARRAY_KERNEL(size_type); + + +} // namespace components +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/components/format_conversion.hip.hpp b/hip/components/format_conversion.hip.hpp new file mode 100644 index 00000000000..1c731862be8 --- /dev/null +++ b/hip/components/format_conversion.hip.hpp @@ -0,0 +1,133 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved.
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_HIP_COMPONENTS_FORMAT_CONVERSION_HIP_HPP_ +#define GKO_HIP_COMPONENTS_FORMAT_CONVERSION_HIP_HPP_ + + +#include + + +#include +#include + + +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +namespace ell { +namespace kernel { + + +/** + * @internal + * + * It counts the number of explicit nonzeros per row of Ell. 
+ */ +template +__global__ void count_nnz_per_row(size_type num_rows, size_type max_nnz_per_row, + size_type stride, + const ValueType *__restrict__ values, + IndexType *__restrict__ result); + + +} // namespace kernel +} // namespace ell + + +namespace coo { +namespace kernel { + + +/** + * @internal + * + * It converts the row index of Coo to the row pointer of Csr. + */ +template +__global__ void convert_row_idxs_to_ptrs(const IndexType *__restrict__ idxs, + size_type num_nonzeros, + IndexType *__restrict__ ptrs, + size_type length); + + +} // namespace kernel + + +namespace host_kernel { + + +/** + * @internal + * + * It calculates the number of warps used in Coo Spmv depending on the GPU + * architecture and the number of stored elements. + */ +template +__host__ size_type calculate_nwarps(std::shared_ptr exec, + const size_type nnz) +{ + size_type nwarps_in_hip = exec->get_num_multiprocessor() * + exec->get_num_warps_per_sm() * config::warp_size / + subwarp_size; +#if GINKGO_HIP_PLATFORM_NVCC + size_type multiple = 8; + if (nnz >= 2e6) { + multiple = 128; + } else if (nnz >= 2e5) { + multiple = 32; + } +#else + size_type multiple = 2; + if (nnz >= 1e7) { + multiple = 32; + } else if (nnz >= 1e5) { + multiple = 8; + } +#endif // GINKGO_HIP_PLATFORM_NVCC + return std::min(multiple * nwarps_in_hip, + size_type(ceildiv(nnz, config::warp_size))); +} + + +} // namespace host_kernel +} // namespace coo +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_COMPONENTS_FORMAT_CONVERSION_HIP_HPP_ diff --git a/hip/components/intrinsics.hip.hpp b/hip/components/intrinsics.hip.hpp new file mode 100644 index 00000000000..8d9d0579013 --- /dev/null +++ b/hip/components/intrinsics.hip.hpp @@ -0,0 +1,53 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_HIP_COMPONENTS_INTRINSICS_HIP_HPP_ +#define GKO_HIP_COMPONENTS_INTRINSICS_HIP_HPP_ + + +#include + + +namespace gko { +namespace kernels { +namespace hip { + + +#include "common/components/intrinsics.hpp.inc" + + +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_COMPONENTS_INTRINSICS_HIP_HPP_ diff --git a/hip/components/merging.hip.hpp b/hip/components/merging.hip.hpp new file mode 100644 index 00000000000..30289d41ed2 --- /dev/null +++ b/hip/components/merging.hip.hpp @@ -0,0 +1,56 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_HIP_COMPONENTS_MERGING_HIP_HPP_ +#define GKO_HIP_COMPONENTS_MERGING_HIP_HPP_ + + +#include "core/base/utils.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/components/intrinsics.hip.hpp" +#include "hip/components/searching.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { + + +#include "common/components/merging.hpp.inc" + + +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_COMPONENTS_MERGING_HIP_HPP_ diff --git a/hip/components/precision_conversion.hip.cpp b/hip/components/precision_conversion.hip.cpp new file mode 100644 index 00000000000..6720cf8c92b --- /dev/null +++ b/hip/components/precision_conversion.hip.cpp @@ -0,0 +1,68 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/components/precision_conversion.hpp" + + +#include "hip/base/types.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +namespace components { + + +constexpr int default_block_size = 512; + + +#include "common/components/precision_conversion.hpp.inc" + + +template +void convert_precision(std::shared_ptr exec, + size_type size, const SourceType *in, TargetType *out) +{ + auto num_blocks = ceildiv(size, default_block_size); + hipLaunchKernelGGL(HIP_KERNEL_NAME(convert_precision), num_blocks, + default_block_size, 0, 0, size, as_hip_type(in), + as_hip_type(out)); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(GKO_DECLARE_CONVERT_PRECISION_KERNEL); + + +} // namespace components +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/components/prefix_sum.hip.cpp b/hip/components/prefix_sum.hip.cpp new file mode 100644 index 00000000000..2fe526cdb2d --- 
/dev/null +++ b/hip/components/prefix_sum.hip.cpp @@ -0,0 +1,73 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/components/prefix_sum.hpp" + + +#include "hip/components/prefix_sum.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +namespace components { + + +constexpr int prefix_sum_block_size = 512; + + +template +void prefix_sum(std::shared_ptr exec, IndexType *counts, + size_type num_entries) +{ + auto num_blocks = ceildiv(num_entries, prefix_sum_block_size); + Array block_sum_array(exec, num_blocks); + auto block_sums = block_sum_array.get_data(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(start_prefix_sum), + dim3(num_blocks), dim3(prefix_sum_block_size), 0, 0, + num_entries, counts, block_sums); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(finalize_prefix_sum), + dim3(num_blocks), dim3(prefix_sum_block_size), 0, 0, num_entries, + counts, block_sums); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL); + +// instantiate for size_type as well, as this is used in the Sellp format +template GKO_DECLARE_PREFIX_SUM_KERNEL(size_type); + + +} // namespace components +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/components/prefix_sum.hip.hpp b/hip/components/prefix_sum.hip.hpp new file mode 100644 index 00000000000..0f1059f964e --- /dev/null +++ b/hip/components/prefix_sum.hip.hpp @@ -0,0 +1,59 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_HIP_COMPONENTS_PREFIX_SUM_HIP_HPP_ +#define GKO_HIP_COMPONENTS_PREFIX_SUM_HIP_HPP_ + + +#include + + +#include "hip/base/hipblas_bindings.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { + + +#include "common/components/prefix_sum.hpp.inc" + + +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_COMPONENTS_PREFIX_SUM_HIP_HPP_ diff --git a/hip/components/reduction.hip.hpp b/hip/components/reduction.hip.hpp new file mode 100644 index 00000000000..953cead968f --- /dev/null +++ b/hip/components/reduction.hip.hpp @@ -0,0 +1,105 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_HIP_COMPONENTS_REDUCTION_HIP_HPP_ +#define GKO_HIP_COMPONENTS_REDUCTION_HIP_HPP_ + + +#include + + +#include +#include +#include + + +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/uninitialized_array.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { + + +constexpr int default_block_size = 512; + + +#include "common/components/reduction.hpp.inc" + + +/** + * Compute a reduction using add operation (+). + * + * @param exec Executor associated to the array + * @param size size of the array + * @param source the pointer of the array + * + * @return the reduction result + */ +template +__host__ ValueType reduce_add_array(std::shared_ptr exec, + size_type size, const ValueType *source) +{ + auto block_results_val = source; + size_type grid_dim = size; + if (size > default_block_size) { + const auto n = ceildiv(size, default_block_size); + grid_dim = (n <= default_block_size) ? 
n : default_block_size; + + auto block_results = Array(exec, grid_dim); + + hipLaunchKernelGGL( + reduce_add_array, dim3(grid_dim), dim3(default_block_size), 0, 0, + size, as_hip_type(source), as_hip_type(block_results.get_data())); + + block_results_val = block_results.get_const_data(); + } + + auto d_result = Array(exec, 1); + + hipLaunchKernelGGL(reduce_add_array, dim3(1), dim3(default_block_size), 0, + 0, grid_dim, as_hip_type(block_results_val), + as_hip_type(d_result.get_data())); + auto answer = exec->copy_val_to_host(d_result.get_const_data()); + return answer; +} + + +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_COMPONENTS_REDUCTION_HIP_HPP_ diff --git a/hip/components/searching.hip.hpp b/hip/components/searching.hip.hpp new file mode 100644 index 00000000000..7611b23fdee --- /dev/null +++ b/hip/components/searching.hip.hpp @@ -0,0 +1,54 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_ +#define GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_ + + +#include "hip/base/config.hip.hpp" +#include "hip/components/intrinsics.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { + + +#include "common/components/searching.hpp.inc" + + +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_ diff --git a/hip/components/segment_scan.hip.hpp b/hip/components/segment_scan.hip.hpp new file mode 100644 index 00000000000..8733778f7e4 --- /dev/null +++ b/hip/components/segment_scan.hip.hpp @@ -0,0 +1,56 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_HIP_COMPONENTS_SEGMENT_SCAN_HIP_HPP_ +#define GKO_HIP_COMPONENTS_SEGMENT_SCAN_HIP_HPP_ + + +#include + + +#include "hip/components/cooperative_groups.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { + + +#include "common/components/segment_scan.hpp.inc" + + +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_COMPONENTS_SEGMENT_SCAN_HIP_HPP_ diff --git a/hip/components/sorting.hip.hpp b/hip/components/sorting.hip.hpp new file mode 100644 index 00000000000..704c8f9dd07 --- /dev/null +++ b/hip/components/sorting.hip.hpp @@ -0,0 +1,54 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_HIP_COMPONENTS_SORTING_HIP_HPP_ +#define GKO_HIP_COMPONENTS_SORTING_HIP_HPP_ + + +#include "hip/base/config.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { + + +#include "common/components/sorting.hpp.inc" + + +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_COMPONENTS_SORTING_HIP_HPP_ diff --git a/hip/components/thread_ids.hip.hpp b/hip/components/thread_ids.hip.hpp new file mode 100644 index 00000000000..6016c26cf68 --- /dev/null +++ b/hip/components/thread_ids.hip.hpp @@ -0,0 +1,60 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_HIP_COMPONENTS_THREAD_IDS_HIP_HPP_ +#define GKO_HIP_COMPONENTS_THREAD_IDS_HIP_HPP_ + + +#include "hip/base/config.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The HIP thread namespace. + * + * @ingroup hip_thread + */ +namespace thread { + + +#include "common/components/thread_ids.hpp.inc" + + +} // namespace thread +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_COMPONENTS_THREAD_IDS_HIP_HPP_ diff --git a/hip/components/uninitialized_array.hip.hpp b/hip/components/uninitialized_array.hip.hpp new file mode 100644 index 00000000000..7780ebb10f5 --- /dev/null +++ b/hip/components/uninitialized_array.hip.hpp @@ -0,0 +1,53 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HIP_HPP_ +#define GKO_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HIP_HPP_ + + +#include + + +namespace gko { +namespace kernels { +namespace hip { + + +#include "common/components/uninitialized_array.hpp.inc" + + +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HIP_HPP_ diff --git a/hip/components/warp_blas.hip.hpp b/hip/components/warp_blas.hip.hpp new file mode 100644 index 00000000000..ee2abc649e1 --- /dev/null +++ b/hip/components/warp_blas.hip.hpp @@ -0,0 +1,60 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_HIP_COMPONENTS_WARP_BLAS_HIP_HPP_ +#define GKO_HIP_COMPONENTS_WARP_BLAS_HIP_HPP_ + + +#include + + +#include + + +#include "hip/base/math.hip.hpp" +#include "hip/components/reduction.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { + + +#include "common/components/warp_blas.hpp.inc" + + +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_COMPONENTS_WARP_BLAS_HIP_HPP_ diff --git a/hip/factorization/factorization_kernels.hip.cpp b/hip/factorization/factorization_kernels.hip.cpp new file mode 100644 index 00000000000..7a0f0a4f607 --- /dev/null +++ b/hip/factorization/factorization_kernels.hip.cpp @@ -0,0 +1,261 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/factorization/factorization_kernels.hpp" + + +#include + + +#include +#include + + +#include "core/components/prefix_sum.hpp" +#include "core/matrix/csr_builder.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/intrinsics.hip.hpp" +#include "hip/components/searching.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The factorization namespace. 
+ * + * @ingroup factor + */ +namespace factorization { + + +constexpr int default_block_size{512}; + + +#include "common/factorization/factorization_kernels.hpp.inc" + + +template +void add_diagonal_elements(std::shared_ptr exec, + matrix::Csr *mtx, + bool is_sorted) +{ + // TODO: Runtime can be optimized by choosing a appropriate size for the + // subwarp dependent on the matrix properties + constexpr int subwarp_size = config::warp_size; + auto mtx_size = mtx->get_size(); + auto num_rows = static_cast(mtx_size[0]); + auto num_cols = static_cast(mtx_size[1]); + size_type row_ptrs_size = num_rows + 1; + + Array row_ptrs_addition(exec, row_ptrs_size); + Array needs_change_host{exec->get_master(), 1}; + needs_change_host.get_data()[0] = false; + Array needs_change_device{exec, 1}; + needs_change_device = needs_change_host; + + auto hip_old_values = as_hip_type(mtx->get_const_values()); + auto hip_old_col_idxs = as_hip_type(mtx->get_const_col_idxs()); + auto hip_old_row_ptrs = as_hip_type(mtx->get_row_ptrs()); + auto hip_row_ptrs_add = as_hip_type(row_ptrs_addition.get_data()); + + const dim3 block_dim{default_block_size, 1, 1}; + const dim3 grid_dim{ + static_cast(ceildiv(num_rows, block_dim.x / subwarp_size)), 1, + 1}; + if (is_sorted) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + kernel::find_missing_diagonal_elements), + grid_dim, block_dim, 0, 0, num_rows, num_cols, hip_old_col_idxs, + hip_old_row_ptrs, hip_row_ptrs_add, + as_hip_type(needs_change_device.get_data())); + } else { + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + kernel::find_missing_diagonal_elements), + grid_dim, block_dim, 0, 0, num_rows, num_cols, hip_old_col_idxs, + hip_old_row_ptrs, hip_row_ptrs_add, + as_hip_type(needs_change_device.get_data())); + } + needs_change_host = needs_change_device; + if (!needs_change_host.get_const_data()[0]) { + return; + } + + components::prefix_sum(exec, hip_row_ptrs_add, row_ptrs_size); + exec->synchronize(); + + auto total_additions = + 
exec->copy_val_to_host(hip_row_ptrs_add + row_ptrs_size - 1); + size_type new_num_elems = static_cast(total_additions) + + mtx->get_num_stored_elements(); + + + Array new_values{exec, new_num_elems}; + Array new_col_idxs{exec, new_num_elems}; + auto hip_new_values = as_hip_type(new_values.get_data()); + auto hip_new_col_idxs = as_hip_type(new_col_idxs.get_data()); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::add_missing_diagonal_elements), + grid_dim, block_dim, 0, 0, num_rows, hip_old_values, hip_old_col_idxs, + hip_old_row_ptrs, hip_new_values, hip_new_col_idxs, hip_row_ptrs_add); + + const dim3 grid_dim_row_ptrs_update{ + static_cast(ceildiv(num_rows, block_dim.x)), 1, 1}; + hipLaunchKernelGGL(kernel::update_row_ptrs, grid_dim_row_ptrs_update, + block_dim, 0, 0, num_rows + 1, hip_old_row_ptrs, + hip_row_ptrs_add); + + matrix::CsrBuilder mtx_builder{mtx}; + mtx_builder.get_value_array() = std::move(new_values); + mtx_builder.get_col_idx_array() = std::move(new_col_idxs); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL); + + +template +void initialize_row_ptrs_l_u( + std::shared_ptr exec, + const matrix::Csr *system_matrix, + IndexType *l_row_ptrs, IndexType *u_row_ptrs) +{ + const size_type num_rows{system_matrix->get_size()[0]}; + + const dim3 block_size{default_block_size, 1, 1}; + const uint32 number_blocks = + ceildiv(num_rows, static_cast(block_size.x)); + const dim3 grid_dim{number_blocks, 1, 1}; + + hipLaunchKernelGGL(kernel::count_nnz_per_l_u_row, dim3(grid_dim), + dim3(block_size), 0, 0, num_rows, + as_hip_type(system_matrix->get_const_row_ptrs()), + as_hip_type(system_matrix->get_const_col_idxs()), + as_hip_type(system_matrix->get_const_values()), + as_hip_type(l_row_ptrs), as_hip_type(u_row_ptrs)); + + components::prefix_sum(exec, l_row_ptrs, num_rows + 1); + components::prefix_sum(exec, u_row_ptrs, num_rows + 1); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + 
GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL); + + +template +void initialize_l_u(std::shared_ptr exec, + const matrix::Csr *system_matrix, + matrix::Csr *csr_l, + matrix::Csr *csr_u) +{ + const size_type num_rows{system_matrix->get_size()[0]}; + const dim3 block_size{default_block_size, 1, 1}; + const dim3 grid_dim{static_cast(ceildiv( + num_rows, static_cast(block_size.x))), + 1, 1}; + + hipLaunchKernelGGL( + kernel::initialize_l_u, dim3(grid_dim), dim3(block_size), 0, 0, + num_rows, as_hip_type(system_matrix->get_const_row_ptrs()), + as_hip_type(system_matrix->get_const_col_idxs()), + as_hip_type(system_matrix->get_const_values()), + as_hip_type(csr_l->get_const_row_ptrs()), + as_hip_type(csr_l->get_col_idxs()), as_hip_type(csr_l->get_values()), + as_hip_type(csr_u->get_const_row_ptrs()), + as_hip_type(csr_u->get_col_idxs()), as_hip_type(csr_u->get_values())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL); + + +template +void initialize_row_ptrs_l( + std::shared_ptr exec, + const matrix::Csr *system_matrix, + IndexType *l_row_ptrs) +{ + const size_type num_rows{system_matrix->get_size()[0]}; + + const dim3 block_size{default_block_size, 1, 1}; + const uint32 number_blocks = + ceildiv(num_rows, static_cast(block_size.x)); + const dim3 grid_dim{number_blocks, 1, 1}; + + hipLaunchKernelGGL(kernel::count_nnz_per_l_row, dim3(grid_dim), + dim3(block_size), 0, 0, num_rows, + as_hip_type(system_matrix->get_const_row_ptrs()), + as_hip_type(system_matrix->get_const_col_idxs()), + as_hip_type(system_matrix->get_const_values()), + as_hip_type(l_row_ptrs)); + + components::prefix_sum(exec, l_row_ptrs, num_rows + 1); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL); + + +template +void initialize_l(std::shared_ptr exec, + const matrix::Csr *system_matrix, + matrix::Csr *csr_l, bool diag_sqrt) +{ + const size_type 
num_rows{system_matrix->get_size()[0]}; + const dim3 block_size{default_block_size, 1, 1}; + const dim3 grid_dim{static_cast(ceildiv( + num_rows, static_cast(block_size.x))), + 1, 1}; + + hipLaunchKernelGGL(kernel::initialize_l, dim3(grid_dim), dim3(block_size), + 0, 0, num_rows, + as_hip_type(system_matrix->get_const_row_ptrs()), + as_hip_type(system_matrix->get_const_col_idxs()), + as_hip_type(system_matrix->get_const_values()), + as_hip_type(csr_l->get_const_row_ptrs()), + as_hip_type(csr_l->get_col_idxs()), + as_hip_type(csr_l->get_values()), diag_sqrt); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL); + + +} // namespace factorization +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/factorization/ilu_kernels.hip.cpp b/hip/factorization/ilu_kernels.hip.cpp new file mode 100644 index 00000000000..8888856e898 --- /dev/null +++ b/hip/factorization/ilu_kernels.hip.cpp @@ -0,0 +1,98 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/factorization/ilu_kernels.hpp" + + +#include + + +#include + + +#include "hip/base/device_guard.hip.hpp" +#include "hip/base/hipsparse_bindings.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The ilu factorization namespace. 
+ * + * @ingroup factor + */ +namespace ilu_factorization { + + +template +void compute_lu(std::shared_ptr exec, + matrix::Csr *m) +{ + const auto id = exec->get_device_id(); + auto handle = exec->get_hipsparse_handle(); + gko::hip::device_guard g{id}; + auto desc = hipsparse::create_mat_descr(); + auto info = hipsparse::create_ilu0_info(); + + // get buffer size for ILU + IndexType num_rows = m->get_size()[0]; + IndexType nnz = m->get_num_stored_elements(); + size_type buffer_size{}; + hipsparse::ilu0_buffer_size(handle, num_rows, nnz, desc, + m->get_const_values(), m->get_const_row_ptrs(), + m->get_const_col_idxs(), info, buffer_size); + + Array buffer{exec, buffer_size}; + + // set up ILU(0) + hipsparse::ilu0_analysis(handle, num_rows, nnz, desc, m->get_const_values(), + m->get_const_row_ptrs(), m->get_const_col_idxs(), + info, HIPSPARSE_SOLVE_POLICY_USE_LEVEL, + buffer.get_data()); + + hipsparse::ilu0(handle, num_rows, nnz, desc, m->get_values(), + m->get_const_row_ptrs(), m->get_const_col_idxs(), info, + HIPSPARSE_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); + + hipsparse::destroy_ilu0_info(info); + hipsparse::destroy(desc); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ILU_COMPUTE_LU_KERNEL); + + +} // namespace ilu_factorization +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp new file mode 100644 index 00000000000..d987ff36856 --- /dev/null +++ b/hip/factorization/par_ict_kernels.hip.cpp @@ -0,0 +1,213 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/factorization/par_ict_kernels.hpp" + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/components/prefix_sum.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/components/intrinsics.hip.hpp" +#include "hip/components/merging.hip.hpp" +#include "hip/components/prefix_sum.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/searching.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The parallel ICT factorization namespace. + * + * @ingroup factor + */ +namespace par_ict_factorization { + + +constexpr auto default_block_size = 512; + + +// subwarp sizes for all warp-parallel kernels (filter, add_candidates) +using compiled_kernels = + syn::value_list; + + +#include "common/factorization/par_ict_spgeam_kernels.hpp.inc" +#include "common/factorization/par_ict_sweep_kernels.hpp.inc" + + +namespace { + + +template +void add_candidates(syn::value_list, + std::shared_ptr exec, + const matrix::Csr *llt, + const matrix::Csr *a, + const matrix::Csr *l, + matrix::Csr *l_new) +{ + auto num_rows = static_cast(llt->get_size()[0]); + auto subwarps_per_block = default_block_size / subwarp_size; + auto num_blocks = ceildiv(num_rows, subwarps_per_block); + matrix::CsrBuilder l_new_builder(l_new); + auto llt_row_ptrs = llt->get_const_row_ptrs(); + auto llt_col_idxs = llt->get_const_col_idxs(); + auto llt_vals = llt->get_const_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto a_vals = a->get_const_values(); + auto l_row_ptrs = l->get_const_row_ptrs(); + auto l_col_idxs = l->get_const_col_idxs(); + auto l_vals = 
l->get_const_values(); + auto l_new_row_ptrs = l_new->get_row_ptrs(); + // count non-zeros per row + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::ict_tri_spgeam_nnz), + dim3(num_blocks), dim3(default_block_size), 0, 0, llt_row_ptrs, + llt_col_idxs, a_row_ptrs, a_col_idxs, l_new_row_ptrs, num_rows); + + // build row ptrs + components::prefix_sum(exec, l_new_row_ptrs, num_rows + 1); + + // resize output arrays + auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows); + l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz); + l_new_builder.get_value_array().resize_and_reset(l_new_nnz); + + auto l_new_col_idxs = l_new->get_col_idxs(); + auto l_new_vals = l_new->get_values(); + + // fill columns and values + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::ict_tri_spgeam_init), + dim3(num_blocks), dim3(default_block_size), 0, 0, llt_row_ptrs, + llt_col_idxs, as_hip_type(llt_vals), a_row_ptrs, a_col_idxs, + as_hip_type(a_vals), l_row_ptrs, l_col_idxs, as_hip_type(l_vals), + l_new_row_ptrs, l_new_col_idxs, as_hip_type(l_new_vals), num_rows); +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); + + +template +void compute_factor(syn::value_list, + std::shared_ptr exec, + const matrix::Csr *a, + matrix::Csr *l, + const matrix::Coo *l_coo) +{ + auto total_nnz = static_cast(l->get_num_stored_elements()); + auto block_size = default_block_size / subwarp_size; + auto num_blocks = ceildiv(total_nnz, block_size); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::ict_sweep), + dim3(num_blocks), dim3(default_block_size), 0, 0, + a->get_const_row_ptrs(), a->get_const_col_idxs(), + as_hip_type(a->get_const_values()), + l->get_const_row_ptrs(), l_coo->get_const_row_idxs(), + l->get_const_col_idxs(), as_hip_type(l->get_values()), + static_cast(l->get_num_stored_elements())); +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor); + + +} // namespace + + +template +void add_candidates(std::shared_ptr exec, + const 
matrix::Csr *llt, + const matrix::Csr *a, + const matrix::Csr *l, + matrix::Csr *l_new) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = + llt->get_num_stored_elements() + a->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_add_candidates( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, llt, a, l, l_new); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL); + + +template +void compute_factor(std::shared_ptr exec, + const matrix::Csr *a, + matrix::Csr *l, + const matrix::Coo *l_coo) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = 2 * l->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_compute_factor( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, a, l, l_coo); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL); + + +} // namespace par_ict_factorization +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/factorization/par_ilu_kernels.hip.cpp b/hip/factorization/par_ilu_kernels.hip.cpp new file mode 100644 index 00000000000..d8caeb90d16 --- /dev/null +++ b/hip/factorization/par_ilu_kernels.hip.cpp @@ -0,0 +1,101 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/factorization/par_ilu_kernels.hpp" + + +#include + + +#include +#include + + +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The parallel ilu factorization namespace. + * + * @ingroup factor + */ +namespace par_ilu_factorization { + + +constexpr int default_block_size{512}; + + +#include "common/factorization/par_ilu_kernels.hpp.inc" + + +template +void compute_l_u_factors(std::shared_ptr exec, + size_type iterations, + const matrix::Coo *system_matrix, + matrix::Csr *l_factor, + matrix::Csr *u_factor) +{ + iterations = (iterations == 0) ? 
10 : iterations; + const auto num_elements = system_matrix->get_num_stored_elements(); + const dim3 block_size{default_block_size, 1, 1}; + const dim3 grid_dim{ + static_cast( + ceildiv(num_elements, static_cast(block_size.x))), + 1, 1}; + for (size_type i = 0; i < iterations; ++i) { + hipLaunchKernelGGL(kernel::compute_l_u_factors, dim3(grid_dim), + dim3(block_size), 0, 0, num_elements, + as_hip_type(system_matrix->get_const_row_idxs()), + as_hip_type(system_matrix->get_const_col_idxs()), + as_hip_type(system_matrix->get_const_values()), + as_hip_type(l_factor->get_const_row_ptrs()), + as_hip_type(l_factor->get_const_col_idxs()), + as_hip_type(l_factor->get_values()), + as_hip_type(u_factor->get_const_row_ptrs()), + as_hip_type(u_factor->get_const_col_idxs()), + as_hip_type(u_factor->get_values())); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL); + + +} // namespace par_ilu_factorization +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp b/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp new file mode 100644 index 00000000000..e4cd1d6bff7 --- /dev/null +++ b/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp @@ -0,0 +1,211 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/components/prefix_sum.hpp" +#include "core/factorization/par_ilut_kernels.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/atomic.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/intrinsics.hip.hpp" +#include "hip/components/prefix_sum.hip.hpp" +#include "hip/components/sorting.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/factorization/par_ilut_select_common.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The parallel ILUT factorization namespace. 
+ * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +// subwarp sizes for filter kernels +using compiled_kernels = + syn::value_list; + + +#include "common/factorization/par_ilut_filter_kernels.hpp.inc" +#include "common/factorization/par_ilut_select_kernels.hpp.inc" + + +template +void threshold_filter_approx(syn::value_list, + std::shared_ptr exec, + const matrix::Csr *m, + IndexType rank, Array *tmp, + remove_complex *threshold, + matrix::Csr *m_out, + matrix::Coo *m_out_coo) +{ + auto values = m->get_const_values(); + IndexType size = m->get_num_stored_elements(); + using AbsType = remove_complex; + constexpr auto bucket_count = kernel::searchtree_width; + auto max_num_threads = ceildiv(size, items_per_thread); + auto max_num_blocks = ceildiv(max_num_threads, default_block_size); + + size_type tmp_size_totals = + ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType)); + size_type tmp_size_partials = ceildiv( + bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType)); + size_type tmp_size_oracles = + ceildiv(size * sizeof(unsigned char), sizeof(ValueType)); + size_type tmp_size_tree = + ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType)); + size_type tmp_size = + tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree; + tmp->resize_and_reset(tmp_size); + + auto total_counts = reinterpret_cast(tmp->get_data()); + auto partial_counts = + reinterpret_cast(tmp->get_data() + tmp_size_totals); + auto oracles = reinterpret_cast( + tmp->get_data() + tmp_size_totals + tmp_size_partials); + auto tree = + reinterpret_cast(tmp->get_data() + tmp_size_totals + + tmp_size_partials + tmp_size_oracles); + + sampleselect_count(values, size, tree, oracles, partial_counts, + total_counts); + + // determine bucket with correct rank + auto bucket = static_cast( + sampleselect_find_bucket(exec, total_counts, rank).idx); + *threshold = + exec->copy_val_to_host(tree + kernel::searchtree_inner_size + bucket); + 
// we implicitly set the first splitter to -inf, but 0 works as well + if (bucket == 0) { + *threshold = zero(); + } + + // filter the elements + auto old_row_ptrs = m->get_const_row_ptrs(); + auto old_col_idxs = m->get_const_col_idxs(); + auto old_vals = m->get_const_values(); + // compute nnz for each row + auto num_rows = static_cast(m->get_size()[0]); + auto block_size = default_block_size / subwarp_size; + auto num_blocks = ceildiv(num_rows, block_size); + auto new_row_ptrs = m_out->get_row_ptrs(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::bucket_filter_nnz), + dim3(num_blocks), dim3(default_block_size), 0, 0, + old_row_ptrs, oracles, num_rows, bucket, new_row_ptrs); + + // build row pointers + components::prefix_sum(exec, new_row_ptrs, num_rows + 1); + + // build matrix + auto new_nnz = exec->copy_val_to_host(new_row_ptrs + num_rows); + // resize arrays and update aliases + matrix::CsrBuilder builder{m_out}; + builder.get_col_idx_array().resize_and_reset(new_nnz); + builder.get_value_array().resize_and_reset(new_nnz); + auto new_col_idxs = m_out->get_col_idxs(); + auto new_vals = m_out->get_values(); + IndexType *new_row_idxs{}; + if (m_out_coo) { + matrix::CooBuilder coo_builder{m_out_coo}; + coo_builder.get_row_idx_array().resize_and_reset(new_nnz); + coo_builder.get_col_idx_array() = + Array::view(exec, new_nnz, new_col_idxs); + coo_builder.get_value_array() = + Array::view(exec, new_nnz, new_vals); + new_row_idxs = m_out_coo->get_row_idxs(); + } + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::bucket_filter), + dim3(num_blocks), dim3(default_block_size), 0, 0, + old_row_ptrs, old_col_idxs, as_hip_type(old_vals), + oracles, num_rows, bucket, new_row_ptrs, new_row_idxs, + new_col_idxs, as_hip_type(new_vals)); +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter_approx, + threshold_filter_approx); + + +template +void threshold_filter_approx(std::shared_ptr exec, + const matrix::Csr *m, + IndexType rank, Array &tmp, + remove_complex &threshold, 
+ matrix::Csr *m_out, + matrix::Coo *m_out_coo) +{ + auto num_rows = m->get_size()[0]; + auto total_nnz = m->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_threshold_filter_approx( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, m, rank, &tmp, + &threshold, m_out, m_out_coo); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/factorization/par_ilut_filter_kernel.hip.cpp b/hip/factorization/par_ilut_filter_kernel.hip.cpp new file mode 100644 index 00000000000..f1b57bd9f32 --- /dev/null +++ b/hip/factorization/par_ilut_filter_kernel.hip.cpp @@ -0,0 +1,166 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/components/prefix_sum.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/intrinsics.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The parallel ILUT factorization namespace. 
+ * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +constexpr auto default_block_size = 512; + + +// subwarp sizes for filter kernels +using compiled_kernels = + syn::value_list; + + +#include "common/factorization/par_ilut_filter_kernels.hpp.inc" + + +namespace { + + +template +void threshold_filter(syn::value_list, + std::shared_ptr exec, + const matrix::Csr *a, + remove_complex threshold, + matrix::Csr *m_out, + matrix::Coo *m_out_coo, bool lower) +{ + auto old_row_ptrs = a->get_const_row_ptrs(); + auto old_col_idxs = a->get_const_col_idxs(); + auto old_vals = a->get_const_values(); + // compute nnz for each row + auto num_rows = static_cast(a->get_size()[0]); + auto block_size = default_block_size / subwarp_size; + auto num_blocks = ceildiv(num_rows, block_size); + auto new_row_ptrs = m_out->get_row_ptrs(); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::threshold_filter_nnz), + dim3(num_blocks), dim3(default_block_size), 0, 0, old_row_ptrs, + as_hip_type(old_vals), num_rows, threshold, new_row_ptrs, lower); + + // build row pointers + components::prefix_sum(exec, new_row_ptrs, num_rows + 1); + + // build matrix + auto new_nnz = exec->copy_val_to_host(new_row_ptrs + num_rows); + // resize arrays and update aliases + matrix::CsrBuilder builder{m_out}; + builder.get_col_idx_array().resize_and_reset(new_nnz); + builder.get_value_array().resize_and_reset(new_nnz); + auto new_col_idxs = m_out->get_col_idxs(); + auto new_vals = m_out->get_values(); + IndexType *new_row_idxs{}; + if (m_out_coo) { + matrix::CooBuilder coo_builder{m_out_coo}; + coo_builder.get_row_idx_array().resize_and_reset(new_nnz); + coo_builder.get_col_idx_array() = + Array::view(exec, new_nnz, new_col_idxs); + coo_builder.get_value_array() = + Array::view(exec, new_nnz, new_vals); + new_row_idxs = m_out_coo->get_row_idxs(); + } + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::threshold_filter), + dim3(num_blocks), dim3(default_block_size), 0, 0, + old_row_ptrs, old_col_idxs, 
as_hip_type(old_vals), + num_rows, threshold, new_row_ptrs, new_row_idxs, + new_col_idxs, as_hip_type(new_vals), lower); +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter, threshold_filter); + + +} // namespace + +template +void threshold_filter(std::shared_ptr exec, + const matrix::Csr *a, + remove_complex threshold, + matrix::Csr *m_out, + matrix::Coo *m_out_coo, bool lower) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = a->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_threshold_filter( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, a, threshold, m_out, + m_out_coo, lower); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace hip +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/hip/factorization/par_ilut_select_common.hip.cpp b/hip/factorization/par_ilut_select_common.hip.cpp new file mode 100644 index 00000000000..ce3d65876b3 --- /dev/null +++ b/hip/factorization/par_ilut_select_common.hip.cpp @@ -0,0 +1,127 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +// force-top: on +// prevent compilation failure related to disappearing assert(...) statements +#include +// force-top: off + + +#include "hip/factorization/par_ilut_select_common.hip.hpp" + + +#include "core/factorization/par_ilut_kernels.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/components/atomic.hip.hpp" +#include "hip/components/intrinsics.hip.hpp" +#include "hip/components/prefix_sum.hip.hpp" +#include "hip/components/searching.hip.hpp" +#include "hip/components/sorting.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The parallel ILUT factorization namespace. 
+ * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +#include "common/factorization/par_ilut_select_kernels.hpp.inc" + + +template +void sampleselect_count(const ValueType *values, IndexType size, + remove_complex *tree, unsigned char *oracles, + IndexType *partial_counts, IndexType *total_counts) +{ + constexpr auto bucket_count = kernel::searchtree_width; + auto num_threads_total = ceildiv(size, items_per_thread); + auto num_blocks = + static_cast(ceildiv(num_threads_total, default_block_size)); + // pick sample, build searchtree + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::build_searchtree), dim3(1), + dim3(bucket_count), 0, 0, as_hip_type(values), size, + tree); + // determine bucket sizes + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::count_buckets), dim3(num_blocks), + dim3(default_block_size), 0, 0, as_hip_type(values), + size, tree, partial_counts, oracles, items_per_thread); + // compute prefix sum and total sum over block-local values + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::block_prefix_sum), + dim3(bucket_count), dim3(default_block_size), 0, 0, + partial_counts, total_counts, num_blocks); + // compute prefix sum over bucket counts + hipLaunchKernelGGL(HIP_KERNEL_NAME(start_prefix_sum), dim3(1), + dim3(bucket_count), 0, 0, bucket_count, total_counts, + total_counts + bucket_count); +} + + +#define DECLARE_SSSS_COUNT(ValueType, IndexType) \ + void sampleselect_count(const ValueType *values, IndexType size, \ + remove_complex *tree, \ + unsigned char *oracles, IndexType *partial_counts, \ + IndexType *total_counts) + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(DECLARE_SSSS_COUNT); + + +template +sampleselect_bucket sampleselect_find_bucket( + std::shared_ptr exec, IndexType *prefix_sum, + IndexType rank) +{ + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::find_bucket), dim3(1), + dim3(config::warp_size), 0, 0, prefix_sum, rank); + IndexType values[3]{}; + exec->get_master()->copy_from(exec.get(), 3, prefix_sum, values); + return 
{values[0], values[1], values[2]}; +} + + +#define DECLARE_SSSS_FIND_BUCKET(IndexType) \ + sampleselect_bucket sampleselect_find_bucket( \ + std::shared_ptr exec, IndexType *prefix_sum, \ + IndexType rank) + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(DECLARE_SSSS_FIND_BUCKET); + + +} // namespace par_ilut_factorization +} // namespace hip +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/hip/factorization/par_ilut_select_common.hip.hpp b/hip/factorization/par_ilut_select_common.hip.hpp new file mode 100644 index 00000000000..0758eaa2eaf --- /dev/null +++ b/hip/factorization/par_ilut_select_common.hip.hpp @@ -0,0 +1,78 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_HIP_FACTORIZATION_PAR_ILUT_SELECT_COMMON_HIP_HPP_ +#define GKO_HIP_FACTORIZATION_PAR_ILUT_SELECT_COMMON_HIP_HPP_ + + +#include +#include +#include + + +namespace gko { +namespace kernels { +namespace hip { +namespace par_ilut_factorization { + + +constexpr auto default_block_size = 512; +constexpr auto items_per_thread = 16; + + +template +void sampleselect_count(const ValueType *values, IndexType size, + remove_complex *tree, unsigned char *oracles, + IndexType *partial_counts, IndexType *total_counts); + + +template +struct sampleselect_bucket { + IndexType idx; + IndexType begin; + IndexType size; +}; + + +template +sampleselect_bucket sampleselect_find_bucket( + std::shared_ptr exec, IndexType *prefix_sum, + IndexType rank); + + +} // namespace par_ilut_factorization +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_FACTORIZATION_PAR_ILUT_SELECT_COMMON_HIP_HPP_ \ No newline at end of file diff --git a/hip/factorization/par_ilut_select_kernel.hip.cpp b/hip/factorization/par_ilut_select_kernel.hip.cpp new file mode 100644 index 00000000000..6916344f2bc --- /dev/null +++ b/hip/factorization/par_ilut_select_kernel.hip.cpp @@ -0,0 +1,189 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include + + +#include + + +#include +#include +#include + + +#include "core/components/prefix_sum.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/components/atomic.hip.hpp" +#include "hip/components/intrinsics.hip.hpp" +#include "hip/components/prefix_sum.hip.hpp" +#include "hip/components/searching.hip.hpp" +#include "hip/components/sorting.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/factorization/par_ilut_select_common.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The parallel ILUT factorization namespace. + * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +#include "common/factorization/par_ilut_select_kernels.hpp.inc" + + +template +void sampleselect_filter(const ValueType *values, IndexType size, + const unsigned char *oracles, + const IndexType *partial_counts, IndexType bucket, + remove_complex *out) +{ + auto num_threads_total = ceildiv(size, items_per_thread); + auto num_blocks = + static_cast(ceildiv(num_threads_total, default_block_size)); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::filter_bucket), dim3(num_blocks), + dim3(default_block_size), 0, 0, as_hip_type(values), + size, bucket, oracles, partial_counts, out, + items_per_thread); +} + + +template +void threshold_select(std::shared_ptr exec, + const matrix::Csr *m, + IndexType rank, Array &tmp1, + Array> &tmp2, + remove_complex &threshold) +{ + auto values = m->get_const_values(); + IndexType size = m->get_num_stored_elements(); + using AbsType = remove_complex; + constexpr auto bucket_count = kernel::searchtree_width; + auto max_num_threads = ceildiv(size, items_per_thread); + auto max_num_blocks = ceildiv(max_num_threads, default_block_size); + + size_type tmp_size_totals = + ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType)); + size_type tmp_size_partials = ceildiv( + 
bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType)); + size_type tmp_size_oracles = + ceildiv(size * sizeof(unsigned char), sizeof(ValueType)); + size_type tmp_size_tree = + ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType)); + size_type tmp_size_vals = + size / bucket_count * 4; // pessimistic estimate for temporary storage + size_type tmp_size = + tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree; + tmp1.resize_and_reset(tmp_size); + tmp2.resize_and_reset(tmp_size_vals); + + auto total_counts = reinterpret_cast(tmp1.get_data()); + auto partial_counts = + reinterpret_cast(tmp1.get_data() + tmp_size_totals); + auto oracles = reinterpret_cast( + tmp1.get_data() + tmp_size_totals + tmp_size_partials); + auto tree = + reinterpret_cast(tmp1.get_data() + tmp_size_totals + + tmp_size_partials + tmp_size_oracles); + + sampleselect_count(values, size, tree, oracles, partial_counts, + total_counts); + + // determine bucket with correct rank, use bucket-local rank + auto bucket = sampleselect_find_bucket(exec, total_counts, rank); + rank -= bucket.begin; + + if (bucket.size * 2 > tmp_size_vals) { + // we need to reallocate tmp2 + tmp2.resize_and_reset(bucket.size * 2); + } + auto tmp21 = tmp2.get_data(); + auto tmp22 = tmp2.get_data() + bucket.size; + // extract target bucket + sampleselect_filter(values, size, oracles, partial_counts, bucket.idx, + tmp22); + + // recursively select from smaller buckets + int step{}; + while (bucket.size > kernel::basecase_size) { + std::swap(tmp21, tmp22); + const auto *tmp_in = tmp21; + auto tmp_out = tmp22; + + sampleselect_count(tmp_in, bucket.size, tree, oracles, partial_counts, + total_counts); + auto new_bucket = sampleselect_find_bucket(exec, total_counts, rank); + sampleselect_filter(tmp_in, bucket.size, oracles, partial_counts, + bucket.idx, tmp_out); + + rank -= new_bucket.begin; + bucket.size = new_bucket.size; + // we should never need more than 5 recursion steps, this 
would mean + // 256^5 = 2^40. fall back to standard library algorithm in that case. + ++step; + if (step > 5) { + Array cpu_out_array{ + exec->get_master(), + Array::view(exec, bucket.size, tmp_out)}; + auto begin = cpu_out_array.get_data(); + auto end = begin + bucket.size; + auto middle = begin + rank; + std::nth_element(begin, middle, end); + threshold = *middle; + return; + } + } + + // base case + auto out_ptr = reinterpret_cast(tmp1.get_data()); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::basecase_select), dim3(1), + dim3(kernel::basecase_block_size), 0, 0, tmp22, + bucket.size, rank, out_ptr); + threshold = exec->copy_val_to_host(out_ptr); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/factorization/par_ilut_spgeam_kernel.hip.cpp b/hip/factorization/par_ilut_spgeam_kernel.hip.cpp new file mode 100644 index 00000000000..3d00ce153ba --- /dev/null +++ b/hip/factorization/par_ilut_spgeam_kernel.hip.cpp @@ -0,0 +1,185 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/components/prefix_sum.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/intrinsics.hip.hpp" +#include "hip/components/merging.hip.hpp" +#include "hip/components/prefix_sum.hip.hpp" +#include "hip/components/searching.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The parallel ILUT factorization namespace. 
+ * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +constexpr auto default_block_size = 512; + + +// subwarp sizes for add_candidates kernels +using compiled_kernels = + syn::value_list; + + +#include "common/factorization/par_ilut_spgeam_kernels.hpp.inc" + + +namespace { + + +template +void add_candidates(syn::value_list, + std::shared_ptr exec, + const matrix::Csr *lu, + const matrix::Csr *a, + const matrix::Csr *l, + const matrix::Csr *u, + matrix::Csr *l_new, + matrix::Csr *u_new) +{ + auto num_rows = static_cast(lu->get_size()[0]); + auto subwarps_per_block = default_block_size / subwarp_size; + auto num_blocks = ceildiv(num_rows, subwarps_per_block); + matrix::CsrBuilder l_new_builder(l_new); + matrix::CsrBuilder u_new_builder(u_new); + auto lu_row_ptrs = lu->get_const_row_ptrs(); + auto lu_col_idxs = lu->get_const_col_idxs(); + auto lu_vals = lu->get_const_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto a_vals = a->get_const_values(); + auto l_row_ptrs = l->get_const_row_ptrs(); + auto l_col_idxs = l->get_const_col_idxs(); + auto l_vals = l->get_const_values(); + auto u_row_ptrs = u->get_const_row_ptrs(); + auto u_col_idxs = u->get_const_col_idxs(); + auto u_vals = u->get_const_values(); + auto l_new_row_ptrs = l_new->get_row_ptrs(); + auto u_new_row_ptrs = u_new->get_row_ptrs(); + // count non-zeros per row + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::tri_spgeam_nnz), + dim3(num_blocks), dim3(default_block_size), 0, 0, + lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs, + l_new_row_ptrs, u_new_row_ptrs, num_rows); + + // build row ptrs + components::prefix_sum(exec, l_new_row_ptrs, num_rows + 1); + components::prefix_sum(exec, u_new_row_ptrs, num_rows + 1); + + // resize output arrays + auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows); + auto u_new_nnz = exec->copy_val_to_host(u_new_row_ptrs + num_rows); + 
l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz); + l_new_builder.get_value_array().resize_and_reset(l_new_nnz); + u_new_builder.get_col_idx_array().resize_and_reset(u_new_nnz); + u_new_builder.get_value_array().resize_and_reset(u_new_nnz); + + auto l_new_col_idxs = l_new->get_col_idxs(); + auto l_new_vals = l_new->get_values(); + auto u_new_col_idxs = u_new->get_col_idxs(); + auto u_new_vals = u_new->get_values(); + + // fill columns and values + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::tri_spgeam_init), + dim3(num_blocks), dim3(default_block_size), 0, 0, + lu_row_ptrs, lu_col_idxs, as_hip_type(lu_vals), + a_row_ptrs, a_col_idxs, as_hip_type(a_vals), l_row_ptrs, + l_col_idxs, as_hip_type(l_vals), u_row_ptrs, u_col_idxs, + as_hip_type(u_vals), l_new_row_ptrs, l_new_col_idxs, + as_hip_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs, + as_hip_type(u_new_vals), num_rows); +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); + + +} // namespace + + +template +void add_candidates(std::shared_ptr exec, + const matrix::Csr *lu, + const matrix::Csr *a, + const matrix::Csr *l, + const matrix::Csr *u, + matrix::Csr *l_new, + matrix::Csr *u_new) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = + lu->get_num_stored_elements() + a->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_add_candidates( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, lu, a, l, u, l_new, + u_new); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/factorization/par_ilut_sweep_kernel.hip.cpp b/hip/factorization/par_ilut_sweep_kernel.hip.cpp new file mode 100644 index 
00000000000..15fb33ec34e --- /dev/null +++ b/hip/factorization/par_ilut_sweep_kernel.hip.cpp @@ -0,0 +1,150 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/components/prefix_sum.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/components/intrinsics.hip.hpp" +#include "hip/components/merging.hip.hpp" +#include "hip/components/prefix_sum.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/searching.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The parallel ILUT factorization namespace. + * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +constexpr auto default_block_size = 512; + + +// subwarp sizes for all warp-parallel kernels (filter, add_candidates) +using compiled_kernels = + syn::value_list; + + +#include "common/factorization/par_ilut_sweep_kernels.hpp.inc" + + +namespace { + + +template +void compute_l_u_factors(syn::value_list, + std::shared_ptr exec, + const matrix::Csr *a, + matrix::Csr *l, + const matrix::Coo *l_coo, + matrix::Csr *u, + const matrix::Coo *u_coo, + matrix::Csr *u_csc) +{ + auto total_nnz = static_cast(l->get_num_stored_elements() + + u->get_num_stored_elements()); + auto block_size = default_block_size / subwarp_size; + auto num_blocks = ceildiv(total_nnz, block_size); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::sweep), dim3(num_blocks), + dim3(default_block_size), 0, 0, a->get_const_row_ptrs(), + a->get_const_col_idxs(), as_hip_type(a->get_const_values()), + l->get_const_row_ptrs(), l_coo->get_const_row_idxs(), + l->get_const_col_idxs(), as_hip_type(l->get_values()), + static_cast(l->get_num_stored_elements()), + u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(), + 
as_hip_type(u->get_values()), u_csc->get_const_row_ptrs(), + u_csc->get_const_col_idxs(), as_hip_type(u_csc->get_values()), + static_cast(u->get_num_stored_elements())); +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_l_u_factors, + compute_l_u_factors); + + +} // namespace + + +template +void compute_l_u_factors(std::shared_ptr exec, + const matrix::Csr *a, + matrix::Csr *l, + const matrix::Coo *l_coo, + matrix::Csr *u, + const matrix::Coo *u_coo, + matrix::Csr *u_csc) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = + l->get_num_stored_elements() + u->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_compute_l_u_factors( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, a, l, l_coo, u, u_coo, + u_csc); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/matrix/coo_kernels.hip.cpp b/hip/matrix/coo_kernels.hip.cpp new file mode 100644 index 00000000000..c4907e899c7 --- /dev/null +++ b/hip/matrix/coo_kernels.hip.cpp @@ -0,0 +1,264 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/coo_kernels.hpp" + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/components/fill_array.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/hipsparse_bindings.hip.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/atomic.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/format_conversion.hip.hpp" +#include "hip/components/segment_scan.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +/** + * @brief The HIP namespace. + * + * @ingroup hip + */ +namespace hip { +/** + * @brief The Coordinate matrix format namespace. 
+ * + * @ingroup coo + */ +namespace coo { + + +constexpr int default_block_size = 512; +constexpr int warps_in_block = 4; +constexpr int spmv_block_size = warps_in_block * config::warp_size; + + +#include "common/matrix/coo_kernels.hpp.inc" + + +template +void spmv(std::shared_ptr exec, + const matrix::Coo *a, + const matrix::Dense *b, matrix::Dense *c) +{ + components::fill_array(exec, c->get_values(), c->get_num_stored_elements(), + zero()); + + spmv2(exec, a, b, c); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV_KERNEL); + + +template +void advanced_spmv(std::shared_ptr exec, + const matrix::Dense *alpha, + const matrix::Coo *a, + const matrix::Dense *b, + const matrix::Dense *beta, + matrix::Dense *c) +{ + dense::scale(exec, beta, c); + advanced_spmv2(exec, alpha, a, b, c); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL); + + +template +void spmv2(std::shared_ptr exec, + const matrix::Coo *a, + const matrix::Dense *b, matrix::Dense *c) +{ + const auto nnz = a->get_num_stored_elements(); + const auto b_ncols = b->get_size()[1]; + const dim3 coo_block(config::warp_size, warps_in_block, 1); + const auto nwarps = host_kernel::calculate_nwarps(exec, nnz); + + if (nwarps > 0) { + // TODO: b_ncols needs to be tuned. 
+ if (b_ncols < 4) { + const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols); + int num_lines = ceildiv(nnz, nwarps * config::warp_size); + hipLaunchKernelGGL( + abstract_spmv, dim3(coo_grid), dim3(coo_block), 0, 0, nnz, + num_lines, as_hip_type(a->get_const_values()), + a->get_const_col_idxs(), as_hip_type(a->get_const_row_idxs()), + as_hip_type(b->get_const_values()), b->get_stride(), + as_hip_type(c->get_values()), c->get_stride()); + } else { + int num_elems = + ceildiv(nnz, nwarps * config::warp_size) * config::warp_size; + const dim3 coo_grid(ceildiv(nwarps, warps_in_block), + ceildiv(b_ncols, config::warp_size)); + hipLaunchKernelGGL( + abstract_spmm, dim3(coo_grid), dim3(coo_block), 0, 0, nnz, + num_elems, as_hip_type(a->get_const_values()), + a->get_const_col_idxs(), as_hip_type(a->get_const_row_idxs()), + b_ncols, as_hip_type(b->get_const_values()), b->get_stride(), + as_hip_type(c->get_values()), c->get_stride()); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL); + + +template +void advanced_spmv2(std::shared_ptr exec, + const matrix::Dense *alpha, + const matrix::Coo *a, + const matrix::Dense *b, + matrix::Dense *c) +{ + const auto nnz = a->get_num_stored_elements(); + const auto nwarps = host_kernel::calculate_nwarps(exec, nnz); + const dim3 coo_block(config::warp_size, warps_in_block, 1); + const auto b_ncols = b->get_size()[1]; + + if (nwarps > 0) { + // TODO: b_ncols needs to be tuned. 
+ if (b_ncols < 4) { + int num_lines = ceildiv(nnz, nwarps * config::warp_size); + const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols); + hipLaunchKernelGGL( + abstract_spmv, dim3(coo_grid), dim3(coo_block), 0, 0, nnz, + num_lines, as_hip_type(alpha->get_const_values()), + as_hip_type(a->get_const_values()), a->get_const_col_idxs(), + as_hip_type(a->get_const_row_idxs()), + as_hip_type(b->get_const_values()), b->get_stride(), + as_hip_type(c->get_values()), c->get_stride()); + } else { + int num_elems = + ceildiv(nnz, nwarps * config::warp_size) * config::warp_size; + const dim3 coo_grid(ceildiv(nwarps, warps_in_block), + ceildiv(b_ncols, config::warp_size)); + hipLaunchKernelGGL( + abstract_spmm, dim3(coo_grid), dim3(coo_block), 0, 0, nnz, + num_elems, as_hip_type(alpha->get_const_values()), + as_hip_type(a->get_const_values()), a->get_const_col_idxs(), + as_hip_type(a->get_const_row_idxs()), b_ncols, + as_hip_type(b->get_const_values()), b->get_stride(), + as_hip_type(c->get_values()), c->get_stride()); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL); + + +template +void convert_row_idxs_to_ptrs(std::shared_ptr exec, + const IndexType *idxs, size_type num_nonzeros, + IndexType *ptrs, size_type length) +{ + const auto grid_dim = ceildiv(num_nonzeros, default_block_size); + + hipLaunchKernelGGL(kernel::convert_row_idxs_to_ptrs, dim3(grid_dim), + dim3(default_block_size), 0, 0, as_hip_type(idxs), + num_nonzeros, as_hip_type(ptrs), length); +} + + +template +void convert_to_csr(std::shared_ptr exec, + const matrix::Coo *source, + matrix::Csr *result) +{ + auto num_rows = result->get_size()[0]; + + auto row_ptrs = result->get_row_ptrs(); + const auto nnz = result->get_num_stored_elements(); + + const auto source_row_idxs = source->get_const_row_idxs(); + + convert_row_idxs_to_ptrs(exec, source_row_idxs, nnz, row_ptrs, + num_rows + 1); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + 
GKO_DECLARE_COO_CONVERT_TO_CSR_KERNEL); + + +template +void convert_to_dense(std::shared_ptr exec, + const matrix::Coo *source, + matrix::Dense *result) +{ + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + const auto stride = result->get_stride(); + + const auto nnz = source->get_num_stored_elements(); + + const dim3 block_size(config::warp_size, + config::max_block_size / config::warp_size, 1); + const dim3 init_grid_dim(ceildiv(stride, block_size.x), + ceildiv(num_rows, block_size.y), 1); + hipLaunchKernelGGL(kernel::initialize_zero_dense, dim3(init_grid_dim), + dim3(block_size), 0, 0, num_rows, num_cols, stride, + as_hip_type(result->get_values())); + + const auto grid_dim = ceildiv(nnz, default_block_size); + hipLaunchKernelGGL(kernel::fill_in_dense, dim3(grid_dim), + dim3(default_block_size), 0, 0, nnz, + as_hip_type(source->get_const_row_idxs()), + as_hip_type(source->get_const_col_idxs()), + as_hip_type(source->get_const_values()), stride, + as_hip_type(result->get_values())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_COO_CONVERT_TO_DENSE_KERNEL); + + +} // namespace coo +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp new file mode 100644 index 00000000000..cf49d441032 --- /dev/null +++ b/hip/matrix/csr_kernels.hip.cpp @@ -0,0 +1,1174 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/matrix/csr_kernels.hpp" + + +#include + + +#include + + +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "core/components/fill_array.hpp" +#include "core/components/prefix_sum.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/hipsparse_bindings.hip.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/base/pointer_mode_guard.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/atomic.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/intrinsics.hip.hpp" +#include "hip/components/merging.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/segment_scan.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/uninitialized_array.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The Compressed sparse row matrix format namespace. + * + * @ingroup csr + */ +namespace csr { + + +constexpr int default_block_size = 512; +constexpr int warps_in_block = 4; +constexpr int spmv_block_size = warps_in_block * config::warp_size; +constexpr int wsize = config::warp_size; +constexpr int classical_overweight = 32; + + +/** + * A compile-time list of the number items per threads for which spmv kernel + * should be compiled. 
+ */
+// NOTE(review): the template argument lists in this file (including the three
+// value lists below) were missing from the reviewed text; they are
+// reconstructed from how the code uses them -- confirm against the
+// repository before applying.
+using compiled_kernels = syn::value_list<int, 3, 4, 6, 7, 8, 12, 14>;
+
+using classical_kernels =
+    syn::value_list<int, config::warp_size, 32, 16, 8, 4, 2, 1>;
+
+using spgeam_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+#include "common/matrix/csr_kernels.hpp.inc"
+
+
+namespace host_kernel {
+
+
+/**
+ * Merge-path product for one compiled `items_per_thread`.
+ *
+ * For every column of `b` it launches kernel::abstract_merge_path_spmv
+ * followed by a single-block kernel::abstract_reduce over the per-block
+ * partial results (`row_out`, `val_out`).
+ *
+ * `alpha` and `beta` must both be null (plain product) or both be non-null
+ * (scaled product); any other combination ends in GKO_KERNEL_NOT_FOUND.
+ */
+template <int items_per_thread, typename ValueType, typename IndexType>
+void merge_path_spmv(syn::value_list<int, items_per_thread>,
+                     std::shared_ptr<const HipExecutor> exec,
+                     const matrix::Csr<ValueType, IndexType> *a,
+                     const matrix::Dense<ValueType> *b,
+                     matrix::Dense<ValueType> *c,
+                     const matrix::Dense<ValueType> *alpha = nullptr,
+                     const matrix::Dense<ValueType> *beta = nullptr)
+{
+    const IndexType total = a->get_size()[0] + a->get_num_stored_elements();
+    const IndexType grid_num =
+        ceildiv(total, spmv_block_size * items_per_thread);
+    const dim3 grid(grid_num);
+    const dim3 block(spmv_block_size);
+    Array<IndexType> row_out(exec, grid_num);
+    Array<ValueType> val_out(exec, grid_num);
+
+    for (IndexType column_id = 0; column_id < b->get_size()[1]; column_id++) {
+        if (alpha == nullptr && beta == nullptr) {
+            const auto b_vals = b->get_const_values() + column_id;
+            auto c_vals = c->get_values() + column_id;
+            hipLaunchKernelGGL(
+                HIP_KERNEL_NAME(
+                    kernel::abstract_merge_path_spmv<items_per_thread>),
+                dim3(grid), dim3(block), 0, 0,
+                static_cast<IndexType>(a->get_size()[0]),
+                as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
+                as_hip_type(a->get_const_row_ptrs()),
+                as_hip_type(a->get_const_srow()), as_hip_type(b_vals),
+                b->get_stride(), as_hip_type(c_vals), c->get_stride(),
+                as_hip_type(row_out.get_data()),
+                as_hip_type(val_out.get_data()));
+            hipLaunchKernelGGL(kernel::abstract_reduce, dim3(1),
+                               dim3(spmv_block_size), 0, 0, grid_num,
+                               as_hip_type(val_out.get_data()),
+                               as_hip_type(row_out.get_data()),
+                               as_hip_type(c_vals), c->get_stride());
+
+        } else if (alpha != nullptr && beta != nullptr) {
+            const auto b_vals = b->get_const_values() + column_id;
+            auto c_vals = c->get_values() + column_id;
+            hipLaunchKernelGGL(
+                HIP_KERNEL_NAME(
+                    kernel::abstract_merge_path_spmv<items_per_thread>),
+                dim3(grid), dim3(block), 0, 0,
+                static_cast<IndexType>(a->get_size()[0]),
+                as_hip_type(alpha->get_const_values()),
+                as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
+                as_hip_type(a->get_const_row_ptrs()),
+                as_hip_type(a->get_const_srow()), as_hip_type(b_vals),
+                b->get_stride(), as_hip_type(beta->get_const_values()),
+                as_hip_type(c_vals), c->get_stride(),
+                as_hip_type(row_out.get_data()),
+                as_hip_type(val_out.get_data()));
+            hipLaunchKernelGGL(kernel::abstract_reduce, dim3(1),
+                               dim3(spmv_block_size), 0, 0, grid_num,
+                               as_hip_type(val_out.get_data()),
+                               as_hip_type(row_out.get_data()),
+                               as_hip_type(alpha->get_const_values()),
+                               as_hip_type(c_vals), c->get_stride());
+        } else {
+            GKO_KERNEL_NOT_FOUND;
+        }
+    }
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_merge_path_spmv, merge_path_spmv);
+
+
+/**
+ * Returns the number of items per thread to use for the merge-path product.
+ *
+ * On the NVCC platform the base value depends on the device's compute
+ * capability (see the table below); otherwise it is 6. The base value is
+ * scaled by 4 / sizeof(IndexType) and bounded from below so that one value
+ * plus one index fit into items_per_thread indices.
+ */
+template <typename ValueType, typename IndexType>
+int compute_items_per_thread(std::shared_ptr<const HipExecutor> exec)
+{
+#if GINKGO_HIP_PLATFORM_NVCC
+
+
+    const int version =
+        (exec->get_major_version() << 4) + exec->get_minor_version();
+    // The num_item is decided to make the occupancy 100%
+    // TODO: Extend this list when new GPU is released
+    //       Tune this parameter
+    // 128 threads/block the number of items per threads
+    // 3.0 3.5: 6
+    // 3.7: 14
+    // 5.0, 5.3, 6.0, 6.2: 8
+    // 5.2, 6.1, 7.0: 12
+    int num_item = 6;
+    switch (version) {
+    case 0x50:
+    case 0x53:
+    case 0x60:
+    case 0x62:
+        num_item = 8;
+        break;
+    case 0x52:
+    case 0x61:
+    case 0x70:
+        num_item = 12;
+        break;
+    case 0x37:
+        num_item = 14;
+        break;
+    }
+
+
+#else
+
+
+    // HIP uses the minimal num_item to make the code work correctly.
+    // TODO: this parameter should be tuned.
+    int num_item = 6;
+
+
+#endif  // GINKGO_HIP_PLATFORM_NVCC
+
+
+    // Ensure that the following is satisfied:
+    // sizeof(IndexType) + sizeof(ValueType)
+    // <= items_per_thread * sizeof(IndexType)
+    constexpr int minimal_num =
+        ceildiv(sizeof(IndexType) + sizeof(ValueType), sizeof(IndexType));
+    int items_per_thread = num_item * 4 / sizeof(IndexType);
+    return std::max(minimal_num, items_per_thread);
+}
+
+
+/**
+ * Classical (row-wise) product for one compiled subwarp size.
+ *
+ * The x-extent of the grid is capped by the warp count derived from the
+ * executor times `classical_overweight`; the y-extent is the number of
+ * columns of `b`.
+ *
+ * `alpha` and `beta` must both be null (plain product) or both be non-null
+ * (scaled product); any other combination ends in GKO_KERNEL_NOT_FOUND.
+ */
+template <int subwarp_size, typename ValueType, typename IndexType>
+void classical_spmv(syn::value_list<int, subwarp_size>,
+                    std::shared_ptr<const HipExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *a,
+                    const matrix::Dense<ValueType> *b,
+                    matrix::Dense<ValueType> *c,
+                    const matrix::Dense<ValueType> *alpha = nullptr,
+                    const matrix::Dense<ValueType> *beta = nullptr)
+{
+    const auto nwarps = exec->get_num_warps_per_sm() *
+                        exec->get_num_multiprocessor() * classical_overweight;
+    const auto gridx =
+        std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size),
+                 int64(nwarps / warps_in_block));
+    const dim3 grid(gridx, b->get_size()[1]);
+    const dim3 block(spmv_block_size);
+
+    if (alpha == nullptr && beta == nullptr) {
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(kernel::abstract_classical_spmv<subwarp_size>),
+            dim3(grid), dim3(block), 0, 0, a->get_size()[0],
+            as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
+            as_hip_type(a->get_const_row_ptrs()),
+            as_hip_type(b->get_const_values()), b->get_stride(),
+            as_hip_type(c->get_values()), c->get_stride());
+
+    } else if (alpha != nullptr && beta != nullptr) {
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(kernel::abstract_classical_spmv<subwarp_size>),
+            dim3(grid), dim3(block), 0, 0, a->get_size()[0],
+            as_hip_type(alpha->get_const_values()),
+            as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
+            as_hip_type(a->get_const_row_ptrs()),
+            as_hip_type(b->get_const_values()), b->get_stride(),
+            as_hip_type(beta->get_const_values()), as_hip_type(c->get_values()),
+            c->get_stride());
+    } else {
+        GKO_KERNEL_NOT_FOUND;
+    }
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv);
+
+
+}  // namespace host_kernel
+
+
+/**
+ * Product of the CSR matrix `a` with `b` into `c`, dispatched on the name of
+ * the strategy stored in `a`: "load_balance", "merge_path", "classical", or
+ * "sparselib"/"cusparse" (hipSPARSE). Any other name ends in
+ * GKO_NOT_IMPLEMENTED.
+ *
+ * The hipSPARSE path only supports `b` and `c` with stride 1.
+ */
+template <typename ValueType, typename IndexType>
+void spmv(std::shared_ptr<const HipExecutor> exec,
+          const matrix::Csr<ValueType, IndexType> *a,
+          const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
+{
+    if (a->get_strategy()->get_name() == "load_balance") {
+        components::fill_array(exec, c->get_values(),
+                               c->get_num_stored_elements(), zero<ValueType>());
+        const IndexType nwarps = a->get_num_srow_elements();
+        if (nwarps > 0) {
+            const dim3 csr_block(config::warp_size, warps_in_block, 1);
+            const dim3 csr_grid(ceildiv(nwarps, warps_in_block),
+                                b->get_size()[1]);
+            hipLaunchKernelGGL(
+                kernel::abstract_spmv, dim3(csr_grid), dim3(csr_block), 0, 0,
+                nwarps, static_cast<IndexType>(a->get_size()[0]),
+                as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
+                as_hip_type(a->get_const_row_ptrs()),
+                as_hip_type(a->get_const_srow()),
+                as_hip_type(b->get_const_values()),
+                as_hip_type(b->get_stride()), as_hip_type(c->get_values()),
+                as_hip_type(c->get_stride()));
+        } else {
+            GKO_NOT_SUPPORTED(nwarps);
+        }
+    } else if (a->get_strategy()->get_name() == "merge_path") {
+        int items_per_thread =
+            host_kernel::compute_items_per_thread<ValueType, IndexType>(exec);
+        host_kernel::select_merge_path_spmv(
+            compiled_kernels(),
+            [&items_per_thread](int compiled_info) {
+                return items_per_thread == compiled_info;
+            },
+            syn::value_list<int>(), syn::type_list<>(), exec, a, b, c);
+    } else if (a->get_strategy()->get_name() == "classical") {
+        IndexType max_length_per_row = 0;
+        using Tcsr = matrix::Csr<ValueType, IndexType>;
+        if (auto strategy =
+                std::dynamic_pointer_cast<const typename Tcsr::classical>(
+                    a->get_strategy())) {
+            max_length_per_row = strategy->get_max_length_per_row();
+        } else if (auto strategy = std::dynamic_pointer_cast<
+                       const typename Tcsr::automatical>(a->get_strategy())) {
+            max_length_per_row = strategy->get_max_length_per_row();
+        } else {
+            GKO_NOT_SUPPORTED(a->get_strategy());
+        }
+        host_kernel::select_classical_spmv(
+            classical_kernels(),
+            [&max_length_per_row](int compiled_info) {
+                return max_length_per_row >= compiled_info;
+            },
+            syn::value_list<int>(), syn::type_list<>(), exec, a, b, c);
+    } else if (a->get_strategy()->get_name() == "sparselib" ||
+               a->get_strategy()->get_name() == "cusparse") {
+        if (hipsparse::is_supported<ValueType, IndexType>::value) {
+            // TODO: add implementation for int64 and multiple RHS
+            // Reject unsupported strides before the matrix descriptor is
+            // created: GKO_NOT_IMPLEMENTED leaves the function by throwing,
+            // which used to skip hipsparse::destroy(descr).
+            if (b->get_stride() != 1 || c->get_stride() != 1) {
+                GKO_NOT_IMPLEMENTED;
+            }
+            auto handle = exec->get_hipsparse_handle();
+            auto descr = hipsparse::create_mat_descr();
+            {
+                hipsparse::pointer_mode_guard pm_guard(handle);
+                auto row_ptrs = a->get_const_row_ptrs();
+                auto col_idxs = a->get_const_col_idxs();
+                auto alpha = one<ValueType>();
+                auto beta = zero<ValueType>();
+                hipsparse::spmv(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                                a->get_size()[0], a->get_size()[1],
+                                a->get_num_stored_elements(), &alpha, descr,
+                                a->get_const_values(), row_ptrs, col_idxs,
+                                b->get_const_values(), &beta, c->get_values());
+            }
+            hipsparse::destroy(descr);
+        } else {
+            GKO_NOT_IMPLEMENTED;
+        }
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPMV_KERNEL);
+
+
+/**
+ * Scaled product with `alpha` and `beta`, dispatched on the name of the
+ * strategy stored in `a` like spmv(). Any unknown name ends in
+ * GKO_NOT_IMPLEMENTED.
+ *
+ * The hipSPARSE path only supports `b` and `c` with stride 1.
+ */
+template <typename ValueType, typename IndexType>
+void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
+                   const matrix::Dense<ValueType> *alpha,
+                   const matrix::Csr<ValueType, IndexType> *a,
+                   const matrix::Dense<ValueType> *b,
+                   const matrix::Dense<ValueType> *beta,
+                   matrix::Dense<ValueType> *c)
+{
+    if (a->get_strategy()->get_name() == "load_balance") {
+        dense::scale(exec, beta, c);
+
+        const IndexType nwarps = a->get_num_srow_elements();
+
+        if (nwarps > 0) {
+            const dim3 csr_block(config::warp_size, warps_in_block, 1);
+            const dim3 csr_grid(ceildiv(nwarps, warps_in_block),
+                                b->get_size()[1]);
+            hipLaunchKernelGGL(
+                kernel::abstract_spmv, dim3(csr_grid), dim3(csr_block), 0, 0,
+                nwarps, static_cast<IndexType>(a->get_size()[0]),
+                as_hip_type(alpha->get_const_values()),
+                as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
+                as_hip_type(a->get_const_row_ptrs()),
+                as_hip_type(a->get_const_srow()),
+                as_hip_type(b->get_const_values()),
+                as_hip_type(b->get_stride()), as_hip_type(c->get_values()),
+                as_hip_type(c->get_stride()));
+        } else {
+            GKO_NOT_SUPPORTED(nwarps);
+        }
+    } else if (a->get_strategy()->get_name() == "sparselib" ||
+               a->get_strategy()->get_name() == "cusparse") {
+        if (hipsparse::is_supported<ValueType, IndexType>::value) {
+            // TODO: add implementation for int64 and multiple RHS
+            // Reject unsupported strides before the matrix descriptor is
+            // created: GKO_NOT_IMPLEMENTED leaves the function by throwing,
+            // which used to skip hipsparse::destroy(descr).
+            if (b->get_stride() != 1 || c->get_stride() != 1) {
+                GKO_NOT_IMPLEMENTED;
+            }
+
+            auto descr = hipsparse::create_mat_descr();
+
+            auto row_ptrs = a->get_const_row_ptrs();
+            auto col_idxs = a->get_const_col_idxs();
+
+            hipsparse::spmv(exec->get_hipsparse_handle(),
+                            HIPSPARSE_OPERATION_NON_TRANSPOSE, a->get_size()[0],
+                            a->get_size()[1], a->get_num_stored_elements(),
+                            alpha->get_const_values(), descr,
+                            a->get_const_values(), row_ptrs, col_idxs,
+                            b->get_const_values(), beta->get_const_values(),
+                            c->get_values());
+
+            hipsparse::destroy(descr);
+        } else {
+            GKO_NOT_IMPLEMENTED;
+        }
+    } else if (a->get_strategy()->get_name() == "classical") {
+        IndexType max_length_per_row = 0;
+        using Tcsr = matrix::Csr<ValueType, IndexType>;
+        if (auto strategy =
+                std::dynamic_pointer_cast<const typename Tcsr::classical>(
+                    a->get_strategy())) {
+            max_length_per_row = strategy->get_max_length_per_row();
+        } else if (auto strategy = std::dynamic_pointer_cast<
+                       const typename Tcsr::automatical>(a->get_strategy())) {
+            max_length_per_row = strategy->get_max_length_per_row();
+        } else {
+            GKO_NOT_SUPPORTED(a->get_strategy());
+        }
+        host_kernel::select_classical_spmv(
+            classical_kernels(),
+            [&max_length_per_row](int compiled_info) {
+                return max_length_per_row >= compiled_info;
+            },
+            syn::value_list<int>(), syn::type_list<>(), exec, a, b, c, alpha,
+            beta);
+    } else if (a->get_strategy()->get_name() == "merge_path") {
+        int items_per_thread =
+            host_kernel::compute_items_per_thread<ValueType, IndexType>(exec);
+        host_kernel::select_merge_path_spmv(
+            compiled_kernels(),
+            [&items_per_thread](int compiled_info) {
+                return items_per_thread == compiled_info;
+            },
+            syn::value_list<int>(), syn::type_list<>(), exec, a, b, c, alpha,
+            beta);
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
+
+
+/**
+ * Sparse matrix-matrix product of `a` and `b` into `c` through hipSPARSE, in
+ * three steps: query the buffer size, count the result entries (which writes
+ * the row pointers of `c`), then resize the column index and value arrays of
+ * `c` and compute the entries.
+ *
+ * Ends in GKO_NOT_IMPLEMENTED when hipSPARSE does not support the
+ * value/index type combination.
+ */
+template <typename ValueType, typename IndexType>
+void spgemm(std::shared_ptr<const HipExecutor> exec,
+            const matrix::Csr<ValueType, IndexType> *a,
+            const matrix::Csr<ValueType, IndexType> *b,
+            matrix::Csr<ValueType, IndexType> *c)
+{
+    if (hipsparse::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_hipsparse_handle();
+        hipsparse::pointer_mode_guard pm_guard(handle);
+        auto a_descr = hipsparse::create_mat_descr();
+        auto b_descr = hipsparse::create_mat_descr();
+        auto c_descr = hipsparse::create_mat_descr();
+        auto d_descr = hipsparse::create_mat_descr();
+        auto info = hipsparse::create_spgemm_info();
+
+        auto alpha = one<ValueType>();
+        auto a_nnz = static_cast<IndexType>(a->get_num_stored_elements());
+        auto a_vals = a->get_const_values();
+        auto a_row_ptrs = a->get_const_row_ptrs();
+        auto a_col_idxs = a->get_const_col_idxs();
+        auto b_nnz = static_cast<IndexType>(b->get_num_stored_elements());
+        auto b_vals = b->get_const_values();
+        auto b_row_ptrs = b->get_const_row_ptrs();
+        auto b_col_idxs = b->get_const_col_idxs();
+        auto null_value = static_cast<ValueType *>(nullptr);
+        auto null_index = static_cast<IndexType *>(nullptr);
+        auto zero_nnz = IndexType{};
+        auto m = static_cast<IndexType>(a->get_size()[0]);
+        auto n = static_cast<IndexType>(b->get_size()[1]);
+        auto k = static_cast<IndexType>(a->get_size()[1]);
+        auto c_row_ptrs = c->get_row_ptrs();
+        matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
+        auto &c_col_idxs_array = c_builder.get_col_idx_array();
+        auto &c_vals_array = c_builder.get_value_array();
+
+        // allocate buffer
+        size_type buffer_size{};
+        hipsparse::spgemm_buffer_size(
+            handle, m, n, k, &alpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs,
+            b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr,
+            zero_nnz, null_index, null_index, info, buffer_size);
+        Array<char> buffer_array(exec, buffer_size);
+        auto buffer = buffer_array.get_data();
+
+        // count nnz
+        IndexType c_nnz{};
+        hipsparse::spgemm_nnz(
+            handle, m, n, k, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr,
+            b_nnz, b_row_ptrs, b_col_idxs, d_descr, zero_nnz, null_index,
+            null_index, c_descr, c_row_ptrs, &c_nnz, info, buffer);
+
+        // accumulate non-zeros
+        c_col_idxs_array.resize_and_reset(c_nnz);
+        c_vals_array.resize_and_reset(c_nnz);
+        auto c_col_idxs = c_col_idxs_array.get_data();
+        auto c_vals = c_vals_array.get_data();
+        hipsparse::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals,
+                          a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals,
+                          b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz,
+                          null_value, null_index, null_index, c_descr, c_vals,
+                          c_row_ptrs, c_col_idxs, info, buffer);
+
+        hipsparse::destroy_spgemm_info(info);
+        hipsparse::destroy(d_descr);
+        hipsparse::destroy(c_descr);
+        hipsparse::destroy(b_descr);
+        hipsparse::destroy(a_descr);
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
+
+
+namespace {
+
+
+/**
+ * Computes alpha * A + beta * B into `c` for one compiled subwarp size, with
+ * A and B given by their raw CSR arrays.
+ *
+ * kernel::spgeam_nnz writes per-row counts into the row pointers of `c`,
+ * which a prefix sum turns into row pointers; the column index and value
+ * arrays of `c` are then resized and filled by kernel::spgeam. The row count
+ * is taken from `c`. Both kernel launches are skipped when `c` has no rows,
+ * because an empty grid is an invalid launch configuration.
+ */
+template <int subwarp_size, typename ValueType, typename IndexType>
+void spgeam(syn::value_list<int, subwarp_size>,
+            std::shared_ptr<const HipExecutor> exec, const ValueType *alpha,
+            const IndexType *a_row_ptrs, const IndexType *a_col_idxs,
+            const ValueType *a_vals, const ValueType *beta,
+            const IndexType *b_row_ptrs, const IndexType *b_col_idxs,
+            const ValueType *b_vals, matrix::Csr<ValueType, IndexType> *c)
+{
+    auto m = static_cast<IndexType>(c->get_size()[0]);
+    auto c_row_ptrs = c->get_row_ptrs();
+    // count nnz for alpha * A + beta * B
+    auto subwarps_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(m, subwarps_per_block);
+    if (num_blocks > 0) {
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::spgeam_nnz<subwarp_size>),
+                           dim3(num_blocks), dim3(default_block_size), 0, 0,
+                           a_row_ptrs, a_col_idxs, b_row_ptrs, b_col_idxs, m,
+                           c_row_ptrs);
+    }
+
+    // build row pointers
+    components::prefix_sum(exec, c_row_ptrs, m + 1);
+
+    // accumulate non-zeros for alpha * A + beta * B
+    matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
+    auto c_nnz = exec->copy_val_to_host(c_row_ptrs + m);
+    c_builder.get_col_idx_array().resize_and_reset(c_nnz);
+    c_builder.get_value_array().resize_and_reset(c_nnz);
+    auto c_col_idxs = c->get_col_idxs();
+    auto c_vals = c->get_values();
+    if (num_blocks > 0) {
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::spgeam<subwarp_size>),
+                           dim3(num_blocks), dim3(default_block_size), 0, 0,
+                           as_hip_type(alpha), a_row_ptrs, a_col_idxs,
+                           as_hip_type(a_vals), as_hip_type(beta), b_row_ptrs,
+                           b_col_idxs, as_hip_type(b_vals), m, c_row_ptrs,
+                           c_col_idxs, as_hip_type(c_vals));
+    }
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_spgeam, spgeam);
+
+
+}  // namespace
+
+
+/**
+ * Computes alpha * A * B + beta * D into `c`: the product A * B is formed in
+ * temporary arrays through hipSPARSE, then combined with `d` by the spgeam
+ * kernels.
+ *
+ * The subwarp size for the second step is picked from `spgeam_kernels` using
+ * the average number of entries per row; a matrix without rows now yields an
+ * average of 0 instead of dividing by zero on the host.
+ *
+ * Ends in GKO_NOT_IMPLEMENTED when hipSPARSE does not support the
+ * value/index type combination.
+ */
+template <typename ValueType, typename IndexType>
+void advanced_spgemm(std::shared_ptr<const HipExecutor> exec,
+                     const matrix::Dense<ValueType> *alpha,
+                     const matrix::Csr<ValueType, IndexType> *a,
+                     const matrix::Csr<ValueType, IndexType> *b,
+                     const matrix::Dense<ValueType> *beta,
+                     const matrix::Csr<ValueType, IndexType> *d,
+                     matrix::Csr<ValueType, IndexType> *c)
+{
+    if (hipsparse::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_hipsparse_handle();
+        hipsparse::pointer_mode_guard pm_guard(handle);
+        auto a_descr = hipsparse::create_mat_descr();
+        auto b_descr = hipsparse::create_mat_descr();
+        auto c_descr = hipsparse::create_mat_descr();
+        auto d_descr = hipsparse::create_mat_descr();
+        auto info = hipsparse::create_spgemm_info();
+
+        auto a_nnz = static_cast<IndexType>(a->get_num_stored_elements());
+        auto a_vals = a->get_const_values();
+        auto a_row_ptrs = a->get_const_row_ptrs();
+        auto a_col_idxs = a->get_const_col_idxs();
+        auto b_nnz = static_cast<IndexType>(b->get_num_stored_elements());
+        auto b_vals = b->get_const_values();
+        auto b_row_ptrs = b->get_const_row_ptrs();
+        auto b_col_idxs = b->get_const_col_idxs();
+        auto d_vals = d->get_const_values();
+        auto d_row_ptrs = d->get_const_row_ptrs();
+        auto d_col_idxs = d->get_const_col_idxs();
+        auto null_value = static_cast<ValueType *>(nullptr);
+        auto null_index = static_cast<IndexType *>(nullptr);
+        auto one_value = one<ValueType>();
+        auto m = static_cast<IndexType>(a->get_size()[0]);
+        auto n = static_cast<IndexType>(b->get_size()[1]);
+        auto k = static_cast<IndexType>(a->get_size()[1]);
+
+        // allocate buffer
+        size_type buffer_size{};
+        hipsparse::spgemm_buffer_size(
+            handle, m, n, k, &one_value, a_descr, a_nnz, a_row_ptrs, a_col_idxs,
+            b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr,
+            IndexType{}, null_index, null_index, info, buffer_size);
+        Array<char> buffer_array(exec, buffer_size);
+        auto buffer = buffer_array.get_data();
+
+        // count nnz
+        Array<IndexType> c_tmp_row_ptrs_array(exec, m + 1);
+        auto c_tmp_row_ptrs = c_tmp_row_ptrs_array.get_data();
+        IndexType c_nnz{};
+        hipsparse::spgemm_nnz(
+            handle, m, n, k, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr,
+            b_nnz, b_row_ptrs, b_col_idxs, d_descr, IndexType{}, null_index,
+            null_index, c_descr, c_tmp_row_ptrs, &c_nnz, info, buffer);
+
+        // accumulate non-zeros for A * B
+        Array<IndexType> c_tmp_col_idxs_array(exec, c_nnz);
+        Array<ValueType> c_tmp_vals_array(exec, c_nnz);
+        auto c_tmp_col_idxs = c_tmp_col_idxs_array.get_data();
+        auto c_tmp_vals = c_tmp_vals_array.get_data();
+        hipsparse::spgemm(handle, m, n, k, &one_value, a_descr, a_nnz, a_vals,
+                          a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals,
+                          b_row_ptrs, b_col_idxs, null_value, d_descr,
+                          IndexType{}, null_value, null_index, null_index,
+                          c_descr, c_tmp_vals, c_tmp_row_ptrs, c_tmp_col_idxs,
+                          info, buffer);
+
+        // destroy hipsparse context
+        hipsparse::destroy_spgemm_info(info);
+        hipsparse::destroy(d_descr);
+        hipsparse::destroy(c_descr);
+        hipsparse::destroy(b_descr);
+        hipsparse::destroy(a_descr);
+
+        auto total_nnz = c_nnz + d->get_num_stored_elements();
+        // guard the average against a matrix without rows
+        auto nnz_per_row = m > 0 ? total_nnz / m : 0;
+        select_spgeam(
+            spgeam_kernels(),
+            [&](int compiled_subwarp_size) {
+                return compiled_subwarp_size >= nnz_per_row ||
+                       compiled_subwarp_size == config::warp_size;
+            },
+            syn::value_list<int>(), syn::type_list<>(), exec,
+            alpha->get_const_values(), c_tmp_row_ptrs, c_tmp_col_idxs,
+            c_tmp_vals, beta->get_const_values(), d_row_ptrs, d_col_idxs,
+            d_vals, c);
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
+
+
+/**
+ * Computes alpha * A + beta * B into `c`.
+ *
+ * The subwarp size is picked from `spgeam_kernels` using the average number
+ * of entries of A and B per row of A; a matrix without rows now yields an
+ * average of 0 instead of dividing by zero on the host.
+ */
+template <typename ValueType, typename IndexType>
+void spgeam(std::shared_ptr<const HipExecutor> exec,
+            const matrix::Dense<ValueType> *alpha,
+            const matrix::Csr<ValueType, IndexType> *a,
+            const matrix::Dense<ValueType> *beta,
+            const matrix::Csr<ValueType, IndexType> *b,
+            matrix::Csr<ValueType, IndexType> *c)
+{
+    auto total_nnz =
+        a->get_num_stored_elements() + b->get_num_stored_elements();
+    const auto num_rows = a->get_size()[0];
+    // guard the average against a matrix without rows
+    auto nnz_per_row = num_rows > 0 ? total_nnz / num_rows : 0;
+    select_spgeam(
+        spgeam_kernels(),
+        [&](int compiled_subwarp_size) {
+            return compiled_subwarp_size >= nnz_per_row ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec,
+        alpha->get_const_values(), a->get_const_row_ptrs(),
+        a->get_const_col_idxs(), a->get_const_values(),
+        beta->get_const_values(), b->get_const_row_ptrs(),
+        b->get_const_col_idxs(), b->get_const_values(), c);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
+
+
+/**
+ * Launches kernel::convert_row_ptrs_to_idxs with one thread per row.
+ * Nothing is launched for zero rows (an empty grid is an invalid launch
+ * configuration).
+ */
+template <typename IndexType>
+void convert_row_ptrs_to_idxs(std::shared_ptr<const HipExecutor> exec,
+                              const IndexType *ptrs, size_type num_rows,
+                              IndexType *idxs)
+{
+    const auto grid_dim = ceildiv(num_rows, default_block_size);
+
+    if (grid_dim > 0) {
+        hipLaunchKernelGGL(kernel::convert_row_ptrs_to_idxs, dim3(grid_dim),
+                           dim3(default_block_size), 0, 0, num_rows,
+                           as_hip_type(ptrs), as_hip_type(idxs));
+    }
+}
+
+
+/**
+ * Builds the row indices of `result` from the row pointers of `source`.
+ * Only the row indices are written here.
+ */
+template <typename ValueType, typename IndexType>
+void convert_to_coo(std::shared_ptr<const HipExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType> *source,
+                    matrix::Coo<ValueType, IndexType> *result)
+{
+    auto num_rows = result->get_size()[0];
+
+    auto row_idxs = result->get_row_idxs();
+    const auto source_row_ptrs = source->get_const_row_ptrs();
+
+    convert_row_ptrs_to_idxs(exec, source_row_ptrs, num_rows, row_idxs);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CONVERT_TO_COO_KERNEL);
+
+
+/**
+ * Writes `source` into the dense matrix `result`: first launches
+ * kernel::initialize_zero_dense over `result`, then kernel::fill_in_dense
+ * with one thread per row.
+ *
+ * Either launch is skipped when its grid would be empty (no rows or zero
+ * stride), since an empty grid is an invalid launch configuration.
+ */
+template <typename ValueType, typename IndexType>
+void convert_to_dense(std::shared_ptr<const HipExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
+{
+    const auto num_rows = result->get_size()[0];
+    const auto num_cols = result->get_size()[1];
+    const auto stride = result->get_stride();
+    const auto row_ptrs = source->get_const_row_ptrs();
+    const auto col_idxs = source->get_const_col_idxs();
+    const auto vals = source->get_const_values();
+
+    const dim3 block_size(config::warp_size,
+                          config::max_block_size / config::warp_size, 1);
+    const dim3 init_grid_dim(ceildiv(stride, block_size.x),
+                             ceildiv(num_rows, block_size.y), 1);
+    if (init_grid_dim.x > 0 && init_grid_dim.y > 0) {
+        hipLaunchKernelGGL(kernel::initialize_zero_dense, dim3(init_grid_dim),
+                           dim3(block_size), 0, 0, num_rows, num_cols, stride,
+                           as_hip_type(result->get_values()));
+    }
+
+    auto grid_dim = ceildiv(num_rows, default_block_size);
+    if (grid_dim > 0) {
+        hipLaunchKernelGGL(
+            kernel::fill_in_dense, dim3(grid_dim), dim3(default_block_size), 0,
+            0, num_rows, as_hip_type(row_ptrs), as_hip_type(col_idxs),
+            as_hip_type(vals), stride, as_hip_type(result->get_values()));
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void convert_to_sellp(std::shared_ptr<const HipExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType> *source,
+                      matrix::Sellp<ValueType, IndexType> *result)
+{
+    const auto num_rows = result->get_size()[0];
+    const auto num_cols = result->get_size()[1];
+
+    auto result_values = result->get_values();
+    auto result_col_idxs = result->get_col_idxs();
+    auto slice_lengths = result->get_slice_lengths();
+    auto slice_sets = result->get_slice_sets();
+
+    const auto slice_size = (result->get_slice_size() == 0)
+                                ? matrix::default_slice_size
+                                : result->get_slice_size();
+    const auto stride_factor = (result->get_stride_factor() == 0)
+                                   ? matrix::default_stride_factor
+                                   : result->get_stride_factor();
+    const int slice_num = ceildiv(num_rows, slice_size);
+
+    const auto source_values = source->get_const_values();
+    const auto source_row_ptrs = source->get_const_row_ptrs();
+    const auto source_col_idxs = source->get_const_col_idxs();
+
+    auto nnz_per_row = Array<size_type>(exec, num_rows);
+    auto grid_dim = ceildiv(num_rows, default_block_size);
+
+    hipLaunchKernelGGL(kernel::calculate_nnz_per_row, dim3(grid_dim),
+                       dim3(default_block_size), 0, 0, num_rows,
+                       as_hip_type(source_row_ptrs),
+                       as_hip_type(nnz_per_row.get_data()));
+
+    grid_dim = slice_num;
+
+    hipLaunchKernelGGL(kernel::calculate_slice_lengths, dim3(grid_dim),
+                       dim3(config::warp_size), 0, 0, num_rows, slice_size,
+                       stride_factor, as_hip_type(nnz_per_row.get_const_data()),
+                       as_hip_type(slice_lengths), as_hip_type(slice_sets));
+
+    components::prefix_sum(exec, slice_sets, slice_num + 1);
+
+    grid_dim = ceildiv(num_rows, default_block_size);
+    hipLaunchKernelGGL(kernel::fill_in_sellp, dim3(grid_dim),
+                       dim3(default_block_size), 0, 0, num_rows, slice_size,
as_hip_type(source_values), as_hip_type(source_row_ptrs), + as_hip_type(source_col_idxs), as_hip_type(slice_lengths), + as_hip_type(slice_sets), as_hip_type(result_col_idxs), + as_hip_type(result_values)); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL); + + +template +void convert_to_ell(std::shared_ptr exec, + const matrix::Csr *source, + matrix::Ell *result) +{ + const auto source_values = source->get_const_values(); + const auto source_row_ptrs = source->get_const_row_ptrs(); + const auto source_col_idxs = source->get_const_col_idxs(); + + auto result_values = result->get_values(); + auto result_col_idxs = result->get_col_idxs(); + const auto stride = result->get_stride(); + const auto max_nnz_per_row = result->get_num_stored_elements_per_row(); + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + + const auto init_grid_dim = + ceildiv(max_nnz_per_row * num_rows, default_block_size); + + hipLaunchKernelGGL(kernel::initialize_zero_ell, dim3(init_grid_dim), + dim3(default_block_size), 0, 0, max_nnz_per_row, stride, + as_hip_type(result_values), + as_hip_type(result_col_idxs)); + + const auto grid_dim = + ceildiv(num_rows * config::warp_size, default_block_size); + + hipLaunchKernelGGL(kernel::fill_in_ell, dim3(grid_dim), + dim3(default_block_size), 0, 0, num_rows, stride, + as_hip_type(source_values), as_hip_type(source_row_ptrs), + as_hip_type(source_col_idxs), as_hip_type(result_values), + as_hip_type(result_col_idxs)); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL); + + +template +void calculate_total_cols(std::shared_ptr exec, + const matrix::Csr *source, + size_type *result, size_type stride_factor, + size_type slice_size) +{ + const auto num_rows = source->get_size()[0]; + const auto slice_num = ceildiv(num_rows, slice_size); + const auto row_ptrs = source->get_const_row_ptrs(); + + auto nnz_per_row = Array(exec, 
num_rows); + auto grid_dim = ceildiv(num_rows, default_block_size); + + hipLaunchKernelGGL(kernel::calculate_nnz_per_row, dim3(grid_dim), + dim3(default_block_size), 0, 0, num_rows, + as_hip_type(row_ptrs), + as_hip_type(nnz_per_row.get_data())); + + grid_dim = ceildiv(slice_num * config::warp_size, default_block_size); + auto max_nnz_per_slice = Array(exec, slice_num); + + hipLaunchKernelGGL(kernel::reduce_max_nnz_per_slice, dim3(grid_dim), + dim3(default_block_size), 0, 0, num_rows, slice_size, + stride_factor, as_hip_type(nnz_per_row.get_const_data()), + as_hip_type(max_nnz_per_slice.get_data())); + + grid_dim = ceildiv(slice_num, default_block_size); + auto block_results = Array(exec, grid_dim); + + hipLaunchKernelGGL(kernel::reduce_total_cols, dim3(grid_dim), + dim3(default_block_size), 0, 0, slice_num, + as_hip_type(max_nnz_per_slice.get_const_data()), + as_hip_type(block_results.get_data())); + + auto d_result = Array(exec, 1); + + hipLaunchKernelGGL(kernel::reduce_total_cols, dim3(1), + dim3(default_block_size), 0, 0, grid_dim, + as_hip_type(block_results.get_const_data()), + as_hip_type(d_result.get_data())); + + *result = exec->copy_val_to_host(d_result.get_const_data()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CALCULATE_TOTAL_COLS_KERNEL); + + +template +void transpose(std::shared_ptr exec, + const matrix::Csr *orig, + matrix::Csr *trans) +{ + if (hipsparse::is_supported::value) { + hipsparseAction_t copyValues = HIPSPARSE_ACTION_NUMERIC; + hipsparseIndexBase_t idxBase = HIPSPARSE_INDEX_BASE_ZERO; + + hipsparse::transpose( + exec->get_hipsparse_handle(), orig->get_size()[0], + orig->get_size()[1], orig->get_num_stored_elements(), + orig->get_const_values(), orig->get_const_row_ptrs(), + orig->get_const_col_idxs(), trans->get_values(), + trans->get_col_idxs(), trans->get_row_ptrs(), copyValues, idxBase); + } else { + GKO_NOT_IMPLEMENTED; + } +} + 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
+
+
+// Same hipsparse::transpose call as the plain transpose, followed by a launch
+// of conjugate_kernel over the trans->get_num_stored_elements() values of
+// `trans`. Reports GKO_NOT_IMPLEMENTED when hipsparse::is_supported::value is
+// false.
+template
+void conj_transpose(std::shared_ptr exec,
+                    const matrix::Csr *orig,
+                    matrix::Csr *trans)
+{
+    if (hipsparse::is_supported::value) {
+        const dim3 block_size(default_block_size, 1, 1);
+        const dim3 grid_size(
+            ceildiv(trans->get_num_stored_elements(), block_size.x), 1, 1);
+
+        hipsparseAction_t copyValues = HIPSPARSE_ACTION_NUMERIC;
+        hipsparseIndexBase_t idxBase = HIPSPARSE_INDEX_BASE_ZERO;
+
+        hipsparse::transpose(
+            exec->get_hipsparse_handle(), orig->get_size()[0],
+            orig->get_size()[1], orig->get_num_stored_elements(),
+            orig->get_const_values(), orig->get_const_row_ptrs(),
+            orig->get_const_col_idxs(), trans->get_values(),
+            trans->get_col_idxs(), trans->get_row_ptrs(), copyValues, idxBase);
+
+        hipLaunchKernelGGL(conjugate_kernel, dim3(grid_size), dim3(block_size),
+                           0, 0, trans->get_num_stored_elements(),
+                           as_hip_type(trans->get_values()));
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
+
+
+// Not implemented for this backend: the body is GKO_NOT_IMPLEMENTED.
+template
+void row_permute(std::shared_ptr exec,
+                 const Array *permutation_indices,
+                 const matrix::Csr *orig,
+                 matrix::Csr *row_permuted)
+    GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
+
+
+// Not implemented for this backend: the body is GKO_NOT_IMPLEMENTED.
+template
+void column_permute(std::shared_ptr exec,
+                    const Array *permutation_indices,
+                    const matrix::Csr *orig,
+                    matrix::Csr *column_permuted)
+    GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL);
+
+
+// Not implemented for this backend: the body is GKO_NOT_IMPLEMENTED.
+template
+void inverse_row_permute(std::shared_ptr exec,
+                         const Array *permutation_indices,
+                         const matrix::Csr *orig,
+                         matrix::Csr *row_permuted)
+    GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL);
+
+
+// Not implemented for this backend: the body is GKO_NOT_IMPLEMENTED.
+template
+void inverse_column_permute(std::shared_ptr exec,
+                            const Array *permutation_indices,
+                            const matrix::Csr *orig,
+                            matrix::Csr *column_permuted)
+    GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL);
+
+
+// Stores in the host variable `*result` the value produced by
+// kernel::calculate_nnz_per_row followed by two kernel::reduce_max_nnz
+// launches (at most default_block_size blocks, then a single block). The
+// final device value is copied back with exec->copy_val_to_host.
+template
+void calculate_max_nnz_per_row(std::shared_ptr exec,
+                               const matrix::Csr *source,
+                               size_type *result)
+{
+    const auto num_rows = source->get_size()[0];
+
+    auto nnz_per_row = Array(exec, num_rows);
+    auto block_results = Array(exec, default_block_size);
+    auto d_result = Array(exec, 1);
+
+    const auto grid_dim = ceildiv(num_rows, default_block_size);
+    hipLaunchKernelGGL(kernel::calculate_nnz_per_row, dim3(grid_dim),
+                       dim3(default_block_size), 0, 0, num_rows,
+                       as_hip_type(source->get_const_row_ptrs()),
+                       as_hip_type(nnz_per_row.get_data()));
+
+    // cap the first reduction at default_block_size blocks so that its
+    // partial results fit into block_results
+    const auto n = ceildiv(num_rows, default_block_size);
+    const auto reduce_dim = n <= default_block_size ? n : default_block_size;
+    hipLaunchKernelGGL(kernel::reduce_max_nnz, dim3(reduce_dim),
+                       dim3(default_block_size), 0, 0, num_rows,
+                       as_hip_type(nnz_per_row.get_const_data()),
+                       as_hip_type(block_results.get_data()));
+
+    hipLaunchKernelGGL(kernel::reduce_max_nnz, dim3(1),
+                       dim3(default_block_size), 0, 0, reduce_dim,
+                       as_hip_type(block_results.get_const_data()),
+                       as_hip_type(d_result.get_data()));
+
+    *result = exec->copy_val_to_host(d_result.get_const_data());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL);
+
+
+// Converts `source` into the hybrid matrix `result` in four steps:
+//   1. kernel::initialize_zero_ell on the ELL values and column indices,
+//   2. kernel::calculate_hybrid_coo_row_nnz fills a temporary per-row array,
+//   3. components::prefix_sum runs over that array (num_rows entries),
+//   4. kernel::fill_in_hybrid writes the ELL and COO arrays of `result`.
+// NOTE(review): coo_num_stored_elements is read but never used below.
+template
+void convert_to_hybrid(std::shared_ptr exec,
+                       const matrix::Csr *source,
+                       matrix::Hybrid *result)
+{
+    auto ell_val = result->get_ell_values();
+    auto ell_col = result->get_ell_col_idxs();
+    auto coo_val = result->get_coo_values();
+    auto coo_col = result->get_coo_col_idxs();
+    auto coo_row = result->get_coo_row_idxs();
+    const auto stride = result->get_ell_stride();
+    const auto max_nnz_per_row = result->get_ell_num_stored_elements_per_row();
+    const auto num_rows = result->get_size()[0];
+    const auto coo_num_stored_elements =
+        result->get_coo_num_stored_elements();
+    auto grid_dim = ceildiv(max_nnz_per_row * num_rows, default_block_size);
+
+    hipLaunchKernelGGL(kernel::initialize_zero_ell, dim3(grid_dim),
+                       dim3(default_block_size), 0, 0, max_nnz_per_row, stride,
+                       as_hip_type(ell_val), as_hip_type(ell_col));
+
+    grid_dim = ceildiv(num_rows, default_block_size);
+    auto coo_offset = Array(exec, num_rows);
+    hipLaunchKernelGGL(kernel::calculate_hybrid_coo_row_nnz, dim3(grid_dim),
+                       dim3(default_block_size), 0, 0, num_rows,
+                       max_nnz_per_row,
+                       as_hip_type(source->get_const_row_ptrs()),
+                       as_hip_type(coo_offset.get_data()));
+
+    components::prefix_sum(exec, coo_offset.get_data(), num_rows);
+
+    grid_dim = ceildiv(num_rows * config::warp_size, default_block_size);
+    hipLaunchKernelGGL(kernel::fill_in_hybrid, dim3(grid_dim),
+                       dim3(default_block_size), 0, 0, num_rows, stride,
+                       max_nnz_per_row, as_hip_type(source->get_const_values()),
+                       as_hip_type(source->get_const_row_ptrs()),
+                       as_hip_type(source->get_const_col_idxs()),
+                       as_hip_type(coo_offset.get_const_data()),
+                       as_hip_type(ell_val), as_hip_type(ell_col),
+                       as_hip_type(coo_val), as_hip_type(coo_col),
+                       as_hip_type(coo_row));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL);
+
+
+// Launches kernel::calculate_nnz_per_row on the row pointers of `source`,
+// writing into the data of `result`. `result` is not resized here.
+template
+void calculate_nonzeros_per_row(std::shared_ptr exec,
+                                const matrix::Csr *source,
+                                Array *result)
+{
+    const auto num_rows = source->get_size()[0];
+    auto row_ptrs = source->get_const_row_ptrs();
+    auto grid_dim = ceildiv(num_rows, default_block_size);
+
+    hipLaunchKernelGGL(kernel::calculate_nnz_per_row, dim3(grid_dim),
+                       dim3(default_block_size), 0, 0, num_rows,
+                       as_hip_type(row_ptrs), as_hip_type(result->get_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CALCULATE_NONZEROS_PER_ROW_KERNEL);
+
+
+// Sorts `to_sort` in place through hipSPARSE: the values are copied to a
+// temporary array, csrsort reorders the column indices while recording the
+// permutation, and gather then writes the values back in permuted order.
+// Reports GKO_NOT_IMPLEMENTED when hipsparse::is_supported::value is false.
+template
+void sort_by_column_index(std::shared_ptr exec,
+                          matrix::Csr *to_sort)
+{
+    if (hipsparse::is_supported::value) {
+        auto handle = exec->get_hipsparse_handle();
+        auto descr = hipsparse::create_mat_descr();
+        auto m = IndexType(to_sort->get_size()[0]);
+        auto n = IndexType(to_sort->get_size()[1]);
+        auto nnz = IndexType(to_sort->get_num_stored_elements());
+        auto row_ptrs = to_sort->get_const_row_ptrs();
+        auto col_idxs = to_sort->get_col_idxs();
+        auto vals = to_sort->get_values();
+
+        // copy values
+        Array tmp_vals_array(exec, nnz);
+        exec->copy(nnz, vals, tmp_vals_array.get_data());
+        auto tmp_vals = tmp_vals_array.get_const_data();
+
+        // init identity permutation
+        Array permutation_array(exec, nnz);
+        auto permutation = permutation_array.get_data();
+        hipsparse::create_identity_permutation(handle, nnz, permutation);
+
+        // allocate buffer
+        size_type buffer_size{};
+        hipsparse::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs,
+                                       buffer_size);
+        Array buffer_array{exec, buffer_size};
+        auto buffer = buffer_array.get_data();
+
+        // sort column indices
+        hipsparse::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs,
+                           permutation, buffer);
+
+        // sort values
+        hipsparse::gather(handle, nnz, tmp_vals, vals, permutation);
+
+        hipsparse::destroy(descr);
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
+
+
+// Sets `*is_sorted` to true, copies it into a device array, launches
+// kernel::check_unsorted on the row pointers and column indices of
+// `to_check`, and copies the device flag back into `*is_sorted` through the
+// host view.
+template
+void is_sorted_by_column_index(
+    std::shared_ptr exec,
+    const matrix::Csr *to_check, bool *is_sorted)
+{
+    *is_sorted = true;
+    // cpu_array is a view onto *is_sorted, so assigning to it below writes
+    // the flag back to the caller
+    auto cpu_array = Array::view(exec->get_master(), 1, is_sorted);
+    auto gpu_array = Array{exec, cpu_array};
+    auto block_size = default_block_size;
+    auto num_rows = static_cast(to_check->get_size()[0]);
+    auto num_blocks = ceildiv(num_rows, block_size);
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(kernel::check_unsorted), dim3(num_blocks),
+        dim3(block_size), 0, 0, to_check->get_const_row_ptrs(),
+        to_check->get_const_col_idxs(), num_rows, gpu_array.get_data());
+    cpu_array = gpu_array;
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
+
+
+}  //
namespace csr
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp
new file mode 100644
index 00000000000..4d5eb1da4b3
--- /dev/null
+++ b/hip/matrix/dense_kernels.hip.cpp
@@ -0,0 +1,690 @@
+/*************************************************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+#include "core/matrix/dense_kernels.hpp"
+
+
+#include
+
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+#include "core/components/prefix_sum.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/hipblas_bindings.hip.hpp"
+#include "hip/base/pointer_mode_guard.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/uninitialized_array.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The Dense matrix format namespace.
+ *
+ * @ingroup dense
+ */
+namespace dense {
+
+
+// Thread-block size used for the kernel launches in this file unless a
+// function defines its own.
+constexpr auto default_block_size = 512;
+
+
+#include "common/matrix/dense_kernels.hpp.inc"
+
+
+// Calls hipblas::gemm with local scalars alpha = one() and beta = zero(),
+// writing into `c`. The gemm call sits inside a scope that holds a
+// hipblas::pointer_mode_guard on the handle. Reports GKO_NOT_IMPLEMENTED when
+// hipblas::is_supported::value is false.
+// NOTE(review): b is passed before a and the dimensions are given as
+// (c columns, c rows, a columns) - presumably to express a row-major product
+// through a column-major BLAS; confirm against the hipblas binding.
+template
+void simple_apply(std::shared_ptr exec,
+                  const matrix::Dense *a,
+                  const matrix::Dense *b,
+                  matrix::Dense *c)
+{
+    if (hipblas::is_supported::value) {
+        auto handle = exec->get_hipblas_handle();
+        {
+            hipblas::pointer_mode_guard pm_guard(handle);
+            auto alpha = one();
+            auto beta = zero();
+            hipblas::gemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, c->get_size()[1],
+                          c->get_size()[0], a->get_size()[1], &alpha,
+                          b->get_const_values(), b->get_stride(),
+                          a->get_const_values(), a->get_stride(), &beta,
+                          c->get_values(), c->get_stride());
+        }
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
+
+
+// Calls hipblas::gemm with the same operand order and dimensions as
+// simple_apply, but takes the scalars from the value arrays of `alpha` and
+// `beta` and does not create a pointer_mode_guard. Reports
+// GKO_NOT_IMPLEMENTED when hipblas::is_supported::value is false.
+template
+void apply(std::shared_ptr exec,
+           const matrix::Dense *alpha,
+           const matrix::Dense *a, const matrix::Dense *b,
+           const matrix::Dense *beta, matrix::Dense *c)
+{
+    if (hipblas::is_supported::value) {
+        hipblas::gemm(exec->get_hipblas_handle(), HIPBLAS_OP_N, HIPBLAS_OP_N,
+                      c->get_size()[1], c->get_size()[0], a->get_size()[1],
+                      alpha->get_const_values(), b->get_const_values(),
+                      b->get_stride(), a->get_const_values(), a->get_stride(),
+                      beta->get_const_values(), c->get_values(),
+                      c->get_stride());
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
+
+
+// Scales `x` by `alpha`. A single-column `x` goes through hipblas::scal when
+// hipblas::is_supported::value is true; every other case launches
+// kernel::scale, which also receives alpha->get_size()[1].
+template
+void scale(std::shared_ptr exec,
+           const matrix::Dense *alpha, matrix::Dense *x)
+{
+    if (hipblas::is_supported::value && x->get_size()[1] == 1) {
+        hipblas::scal(exec->get_hipblas_handle(), x->get_size()[0],
+                      alpha->get_const_values(), x->get_values(),
+                      x->get_stride());
+    } else {
+        // TODO: tune this parameter
+        constexpr auto block_size = default_block_size;
+        const dim3 grid_dim =
+            ceildiv(x->get_size()[0] * x->get_size()[1], block_size);
+        const dim3 block_dim{config::warp_size, 1,
+                             block_size / config::warp_size};
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(kernel::scale), dim3(grid_dim),
+            dim3(block_dim), 0, 0, x->get_size()[0], x->get_size()[1],
+            alpha->get_size()[1], as_hip_type(alpha->get_const_values()),
+            as_hip_type(x->get_values()), x->get_stride());
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL);
+
+
+// Updates `y` from `alpha` and `x`. A single-column `x` goes through
+// hipblas::axpy when hipblas::is_supported::value is true; every other case
+// launches kernel::add_scaled, which also receives alpha->get_size()[1].
+template
+void add_scaled(std::shared_ptr exec,
+                const matrix::Dense *alpha,
+                const matrix::Dense *x, matrix::Dense *y)
+{
+    if (hipblas::is_supported::value && x->get_size()[1] == 1) {
+        hipblas::axpy(exec->get_hipblas_handle(), x->get_size()[0],
+                      alpha->get_const_values(), x->get_const_values(),
+                      x->get_stride(), y->get_values(), y->get_stride());
+    } else {
+        // TODO: tune this parameter
+        constexpr auto block_size = default_block_size;
+        const dim3 grid_dim =
+            ceildiv(x->get_size()[0] * x->get_size()[1], block_size);
+        const dim3 block_dim{config::warp_size, 1,
+                             block_size / config::warp_size};
+        hipLaunchKernelGGL(
+            HIP_KERNEL_NAME(kernel::add_scaled), dim3(grid_dim),
+            dim3(block_dim), 0, 0, x->get_size()[0], x->get_size()[1],
+            alpha->get_size()[1], as_hip_type(alpha->get_const_values()),
+            as_hip_type(x->get_const_values()), x->get_stride(),
+            as_hip_type(y->get_values()), y->get_stride());
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_KERNEL); + + +template +void compute_dot(std::shared_ptr exec, + const matrix::Dense *x, + const matrix::Dense *y, + matrix::Dense *result) +{ + if (hipblas::is_supported::value) { + // TODO: write a custom kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + hipblas::dot(exec->get_hipblas_handle(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), + result->get_values() + col); + } + } else { + // TODO: these are tuning parameters obtained experimentally, once + // we decide how to handle this uniformly, they should be modified + // appropriately + constexpr auto work_per_thread = 32; + constexpr auto block_size = 1024; + + constexpr auto work_per_block = work_per_thread * block_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{config::warp_size, 1, + block_size / config::warp_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::compute_partial_dot), + dim3(grid_dim), dim3(block_dim), 0, 0, x->get_size()[0], + as_hip_type(x->get_const_values() + col), x->get_stride(), + as_hip_type(y->get_const_values() + col), y->get_stride(), + as_hip_type(work.get_data())); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::finalize_dot_computation), + dim3(1), dim3(block_dim), 0, 0, grid_dim.x, + as_hip_type(work.get_const_data()), + as_hip_type(result->get_values() + col)); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); + + +template +void compute_norm2(std::shared_ptr exec, + const matrix::Dense *x, + matrix::Dense> *result) +{ + if (hipblas::is_supported::value) { + for (size_type col = 0; col < x->get_size()[1]; ++col) { + hipblas::norm2(exec->get_hipblas_handle(), 
x->get_size()[0], + x->get_const_values() + col, x->get_stride(), + result->get_values() + col); + } + } else { + using norm_type = remove_complex; + // TODO: these are tuning parameters obtained experimentally, once + // we decide how to handle this uniformly, they should be modified + // appropriately + constexpr auto work_per_thread = 32; + constexpr auto block_size = 1024; + + constexpr auto work_per_block = work_per_thread * block_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{config::warp_size, 1, + block_size / config::warp_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::compute_partial_norm2), + dim3(grid_dim), dim3(block_dim), 0, 0, x->get_size()[0], + as_hip_type(x->get_const_values() + col), x->get_stride(), + as_hip_type(work.get_data())); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::finalize_norm2_computation), + dim3(1), dim3(block_dim), 0, 0, grid_dim.x, + as_hip_type(work.get_const_data()), + as_hip_type(result->get_values() + col)); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); + + +template +void convert_to_coo(std::shared_ptr exec, + const matrix::Dense *source, + matrix::Coo *result) +{ + auto num_rows = result->get_size()[0]; + auto num_cols = result->get_size()[1]; + + auto row_idxs = result->get_row_idxs(); + auto col_idxs = result->get_col_idxs(); + auto values = result->get_values(); + + auto stride = source->get_stride(); + + auto nnz_prefix_sum = Array(exec, num_rows); + calculate_nonzeros_per_row(exec, source, &nnz_prefix_sum); + + const size_type grid_dim = ceildiv(num_rows, default_block_size); + auto add_values = Array(exec, grid_dim); + + components::prefix_sum(exec, nnz_prefix_sum.get_data(), num_rows); + + hipLaunchKernelGGL(kernel::fill_in_coo, dim3(grid_dim), + 
dim3(default_block_size), 0, 0, num_rows, num_cols, + stride, as_hip_type(nnz_prefix_sum.get_const_data()), + as_hip_type(source->get_const_values()), + as_hip_type(row_idxs), as_hip_type(col_idxs), + as_hip_type(values)); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL); + + +template +void convert_to_csr(std::shared_ptr exec, + const matrix::Dense *source, + matrix::Csr *result) +{ + auto num_rows = result->get_size()[0]; + auto num_cols = result->get_size()[1]; + + auto row_ptrs = result->get_row_ptrs(); + auto col_idxs = result->get_col_idxs(); + auto values = result->get_values(); + + auto stride = source->get_stride(); + + const auto rows_per_block = ceildiv(default_block_size, config::warp_size); + const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); + + hipLaunchKernelGGL(kernel::count_nnz_per_row, dim3(grid_dim_nnz), + dim3(default_block_size), 0, 0, num_rows, num_cols, + stride, as_hip_type(source->get_const_values()), + as_hip_type(row_ptrs)); + + components::prefix_sum(exec, row_ptrs, num_rows + 1); + + size_type grid_dim = ceildiv(num_rows, default_block_size); + + hipLaunchKernelGGL( + kernel::fill_in_csr, dim3(grid_dim), dim3(default_block_size), 0, 0, + num_rows, num_cols, stride, as_hip_type(source->get_const_values()), + as_hip_type(row_ptrs), as_hip_type(col_idxs), as_hip_type(values)); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL); + + +template +void convert_to_ell(std::shared_ptr exec, + const matrix::Dense *source, + matrix::Ell *result) +{ + auto num_rows = result->get_size()[0]; + auto num_cols = result->get_size()[1]; + auto max_nnz_per_row = result->get_num_stored_elements_per_row(); + + auto col_ptrs = result->get_col_idxs(); + auto values = result->get_values(); + + auto source_stride = source->get_stride(); + auto result_stride = result->get_stride(); + + auto grid_dim = ceildiv(result_stride, default_block_size); + 
hipLaunchKernelGGL(kernel::fill_in_ell, dim3(grid_dim), + dim3(default_block_size), 0, 0, num_rows, num_cols, + source_stride, as_hip_type(source->get_const_values()), + max_nnz_per_row, result_stride, as_hip_type(col_ptrs), + as_hip_type(values)); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL); + + +template +void convert_to_hybrid(std::shared_ptr exec, + const matrix::Dense *source, + matrix::Hybrid *result) + GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL); + + +template +void convert_to_sellp(std::shared_ptr exec, + const matrix::Dense *source, + matrix::Sellp *result) +{ + const auto stride = source->get_stride(); + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + + auto vals = result->get_values(); + auto col_idxs = result->get_col_idxs(); + auto slice_lengths = result->get_slice_lengths(); + auto slice_sets = result->get_slice_sets(); + + const auto slice_size = (result->get_slice_size() == 0) + ? matrix::default_slice_size + : result->get_slice_size(); + const auto stride_factor = (result->get_stride_factor() == 0) + ? 
matrix::default_stride_factor + : result->get_stride_factor(); + const int slice_num = ceildiv(num_rows, slice_size); + + auto nnz_per_row = Array(exec, num_rows); + calculate_nonzeros_per_row(exec, source, &nnz_per_row); + + auto grid_dim = slice_num; + + hipLaunchKernelGGL(kernel::calculate_slice_lengths, dim3(grid_dim), + dim3(config::warp_size), 0, 0, num_rows, slice_size, + slice_num, stride_factor, + as_hip_type(nnz_per_row.get_const_data()), + as_hip_type(slice_lengths), as_hip_type(slice_sets)); + + components::prefix_sum(exec, slice_sets, slice_num + 1); + + grid_dim = ceildiv(num_rows, default_block_size); + hipLaunchKernelGGL( + kernel::fill_in_sellp, dim3(grid_dim), dim3(default_block_size), 0, 0, + num_rows, num_cols, slice_size, stride, + as_hip_type(source->get_const_values()), as_hip_type(slice_lengths), + as_hip_type(slice_sets), as_hip_type(col_idxs), as_hip_type(vals)); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL); + + +template +void convert_to_sparsity_csr(std::shared_ptr exec, + const matrix::Dense *source, + matrix::SparsityCsr *result) + GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL); + + +template +void count_nonzeros(std::shared_ptr exec, + const matrix::Dense *source, size_type *result) +{ + const auto num_rows = source->get_size()[0]; + auto nnz_per_row = Array(exec, num_rows); + + calculate_nonzeros_per_row(exec, source, &nnz_per_row); + + *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL); + + +template +void calculate_max_nnz_per_row(std::shared_ptr exec, + const matrix::Dense *source, + size_type *result) +{ + const auto num_rows = source->get_size()[0]; + auto nnz_per_row = Array(exec, num_rows); + + calculate_nonzeros_per_row(exec, source, &nnz_per_row); + + const auto n = ceildiv(num_rows, 
default_block_size); + const size_type grid_dim = + (n <= default_block_size) ? n : default_block_size; + + auto block_results = Array(exec, grid_dim); + + hipLaunchKernelGGL(kernel::reduce_max_nnz, dim3(grid_dim), + dim3(default_block_size), + default_block_size * sizeof(size_type), 0, num_rows, + as_hip_type(nnz_per_row.get_const_data()), + as_hip_type(block_results.get_data())); + + auto d_result = Array(exec, 1); + + hipLaunchKernelGGL(kernel::reduce_max_nnz, dim3(1), + dim3(default_block_size), + default_block_size * sizeof(size_type), 0, grid_dim, + as_hip_type(block_results.get_const_data()), + as_hip_type(d_result.get_data())); + + *result = exec->copy_val_to_host(d_result.get_const_data()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); + + +template +void calculate_nonzeros_per_row(std::shared_ptr exec, + const matrix::Dense *source, + Array *result) +{ + const dim3 block_size(default_block_size, 1, 1); + auto rows_per_block = ceildiv(default_block_size, config::warp_size); + const size_t grid_x = ceildiv(source->get_size()[0], rows_per_block); + const dim3 grid_size(grid_x, 1, 1); + hipLaunchKernelGGL(kernel::count_nnz_per_row, dim3(grid_size), + dim3(block_size), 0, 0, source->get_size()[0], + source->get_size()[1], source->get_stride(), + as_hip_type(source->get_const_values()), + as_hip_type(result->get_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL); + + +template +void calculate_total_cols(std::shared_ptr exec, + const matrix::Dense *source, + size_type *result, size_type stride_factor, + size_type slice_size) +{ + const auto num_rows = source->get_size()[0]; + const auto num_cols = source->get_size()[1]; + const auto slice_num = ceildiv(num_rows, slice_size); + + auto nnz_per_row = Array(exec, num_rows); + + calculate_nonzeros_per_row(exec, source, &nnz_per_row); + + auto max_nnz_per_slice = Array(exec, slice_num); + + auto grid_dim = 
ceildiv(slice_num * config::warp_size, default_block_size); + + hipLaunchKernelGGL(kernel::reduce_max_nnz_per_slice, dim3(grid_dim), + dim3(default_block_size), 0, 0, num_rows, slice_size, + stride_factor, as_hip_type(nnz_per_row.get_const_data()), + as_hip_type(max_nnz_per_slice.get_data())); + + grid_dim = ceildiv(slice_num, default_block_size); + auto block_results = Array(exec, grid_dim); + + hipLaunchKernelGGL(kernel::reduce_total_cols, dim3(grid_dim), + dim3(default_block_size), + default_block_size * sizeof(size_type), 0, slice_num, + as_hip_type(max_nnz_per_slice.get_const_data()), + as_hip_type(block_results.get_data())); + + auto d_result = Array(exec, 1); + + hipLaunchKernelGGL(kernel::reduce_total_cols, dim3(1), + dim3(default_block_size), + default_block_size * sizeof(size_type), 0, grid_dim, + as_hip_type(block_results.get_const_data()), + as_hip_type(d_result.get_data())); + + *result = exec->copy_val_to_host(d_result.get_const_data()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_CALCULATE_TOTAL_COLS_KERNEL); + + +// Writes the transpose of orig into trans with hipblas::geam (HIPBLAS_OP_T on +// the first operand, alpha = one(), beta = zero()). Value types for which +// hipblas::is_supported reports false end in GKO_NOT_IMPLEMENTED. +template +void transpose(std::shared_ptr exec, + const matrix::Dense *orig, + matrix::Dense *trans) +{ + if (hipblas::is_supported::value) { + auto handle = exec->get_hipblas_handle(); + { + hipblas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta = zero(); + // NOTE(review): beta is zero, so the second operand (orig again, + // with trans->get_size()[1] as its leading dimension) should not + // contribute - confirm against the geam contract. + hipblas::geam(handle, HIPBLAS_OP_T, HIPBLAS_OP_N, + orig->get_size()[0], orig->get_size()[1], &alpha, + orig->get_const_values(), orig->get_stride(), &beta, + orig->get_const_values(), trans->get_size()[1], + trans->get_values(), trans->get_stride()); + } + } else { + GKO_NOT_IMPLEMENTED; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TRANSPOSE_KERNEL); + + +// Same structure as transpose, but the geam call uses HIPBLAS_OP_C on the +// first operand. +template +void conj_transpose(std::shared_ptr exec, + const matrix::Dense *orig, + matrix::Dense *trans) +{ + if (hipblas::is_supported::value) { + auto handle = exec->get_hipblas_handle(); + { + hipblas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta =
zero(); + hipblas::geam(handle, HIPBLAS_OP_C, HIPBLAS_OP_N, + orig->get_size()[0], orig->get_size()[1], &alpha, + orig->get_const_values(), orig->get_stride(), &beta, + orig->get_const_values(), trans->get_size()[1], + trans->get_values(), trans->get_stride()); + } + } else { + GKO_NOT_IMPLEMENTED; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CONJ_TRANSPOSE_KERNEL); + + +// Launches kernel::row_permute, passing the permutation indices, the values +// and stride of orig, and the values and stride of row_permuted. The grid has +// ceildiv(rows * cols, block_size) blocks; each block is laid out as +// (warp_size, 1, block_size / warp_size). +template +void row_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Dense *orig, + matrix::Dense *row_permuted) +{ + constexpr auto block_size = default_block_size; + const dim3 grid_dim = + ceildiv(orig->get_size()[0] * orig->get_size()[1], block_size); + const dim3 block_dim{config::warp_size, 1, block_size / config::warp_size}; + hipLaunchKernelGGL( + kernel::row_permute, dim3(grid_dim), dim3(block_dim), 0, 0, + orig->get_size()[0], orig->get_size()[1], + as_hip_type(permutation_indices->get_const_data()), + as_hip_type(orig->get_const_values()), orig->get_stride(), + as_hip_type(row_permuted->get_values()), row_permuted->get_stride()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ROW_PERMUTE_KERNEL); + + +// Launches kernel::column_permute with the same launch geometry and argument +// layout as row_permute, writing into column_permuted. +template +void column_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Dense *orig, + matrix::Dense *column_permuted) +{ + constexpr auto block_size = default_block_size; + const dim3 grid_dim = + ceildiv(orig->get_size()[0] * orig->get_size()[1], block_size); + const dim3 block_dim{config::warp_size, 1, block_size / config::warp_size}; + hipLaunchKernelGGL( + kernel::column_permute, dim3(grid_dim), dim3(block_dim), 0, + 0, orig->get_size()[0], orig->get_size()[1], + as_hip_type(permutation_indices->get_const_data()), + as_hip_type(orig->get_const_values()), orig->get_stride(), + as_hip_type(column_permuted->get_values()), + column_permuted->get_stride()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_COLUMN_PERMUTE_KERNEL); + + +template +void
inverse_row_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Dense *orig, + matrix::Dense *row_permuted) +{ + // Launches kernel::inverse_row_permute over ceildiv(rows * cols, + // block_size) blocks shaped (warp_size, 1, block_size / warp_size). + constexpr auto block_size = default_block_size; + const dim3 grid_dim = + ceildiv(orig->get_size()[0] * orig->get_size()[1], block_size); + const dim3 block_dim{config::warp_size, 1, block_size / config::warp_size}; + hipLaunchKernelGGL( + kernel::inverse_row_permute, dim3(grid_dim), + dim3(block_dim), 0, 0, orig->get_size()[0], orig->get_size()[1], + as_hip_type(permutation_indices->get_const_data()), + as_hip_type(orig->get_const_values()), orig->get_stride(), + as_hip_type(row_permuted->get_values()), row_permuted->get_stride()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_INVERSE_ROW_PERMUTE_KERNEL); + + +// Launches kernel::inverse_column_permute with the same launch geometry and +// argument layout as inverse_row_permute, writing into column_permuted. +template +void inverse_column_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Dense *orig, + matrix::Dense *column_permuted) +{ + constexpr auto block_size = default_block_size; + const dim3 grid_dim = + ceildiv(orig->get_size()[0] * orig->get_size()[1], block_size); + const dim3 block_dim{config::warp_size, 1, block_size / config::warp_size}; + hipLaunchKernelGGL( + kernel::inverse_column_permute, dim3(grid_dim), + dim3(block_dim), 0, 0, orig->get_size()[0], orig->get_size()[1], + as_hip_type(permutation_indices->get_const_data()), + as_hip_type(orig->get_const_values()), orig->get_stride(), + as_hip_type(column_permuted->get_values()), + column_permuted->get_stride()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_INVERSE_COLUMN_PERMUTE_KERNEL); + + +} // namespace dense +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp new file mode 100644 index 00000000000..c29da194aa6 --- /dev/null +++ b/hip/matrix/ell_kernels.hip.cpp @@ -0,0 +1,378 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All
rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/matrix/ell_kernels.hpp" + + +#include + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/components/fill_array.hpp" +#include "core/components/prefix_sum.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/hipsparse_bindings.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/atomic.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/format_conversion.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The ELL matrix format namespace. + * + * @ingroup ell + */ +namespace ell { + + +constexpr int default_block_size = 512; + + +// TODO: num_threads_per_core and ratio are parameters that should be tuned +/** + * num_threads_per_core is the oversubscription parameter. There are + * `num_threads_per_core` threads assigned to each physical core. + */ +constexpr int num_threads_per_core = 4; + + +/** + * ratio is the threshold that decides when several threads cooperate on the + * reduction of each row (used when #stored cols per row / #rows > ratio). + */ +constexpr double ratio = 1e-2; + + +/** + * max_thread_per_worker is the maximum number of threads per worker. The + * `compiled_kernels` list must be <0, 1, 2, ..., max_thread_per_worker>. + */ +constexpr int max_thread_per_worker = 32; + + +/** + * A compile-time list of sub-warp sizes for which the spmv kernels should be + * compiled. + * 0 is a special case where it uses max_thread_per_worker threads per worker + * in combination with atomic_adds.
+ */ +using compiled_kernels = syn::value_list; + + +#include "common/matrix/ell_kernels.hpp.inc" + + +namespace { + + +// Launches kernel::spmv for the compile-time configuration `info`. +// info == 0 selects max_thread_per_worker threads per worker with +// atomic == true; any other info is used directly as the number of threads +// per worker with atomic == false. When alpha and beta are both null the +// variant without scaling factors is launched, when both are set the variant +// taking alpha and beta; a mixed null/non-null pair ends in +// GKO_KERNEL_NOT_FOUND. +template +void abstract_spmv(syn::value_list, int num_worker_per_row, + const matrix::Ell *a, + const matrix::Dense *b, + matrix::Dense *c, + const matrix::Dense *alpha = nullptr, + const matrix::Dense *beta = nullptr) +{ + const auto nrows = a->get_size()[0]; + constexpr int num_thread_per_worker = + (info == 0) ? max_thread_per_worker : info; + constexpr bool atomic = (info == 0); + // Block: default_block_size / num_thread_per_worker workers in x, the + // threads of one worker in y. Grid: x covers nrows * num_worker_per_row + // workers, y has one entry per column of b. + const dim3 block_size(default_block_size / num_thread_per_worker, + num_thread_per_worker, 1); + const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x), + b->get_size()[1], 1); + if (alpha == nullptr && beta == nullptr) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::spmv), + dim3(grid_size), dim3(block_size), 0, 0, nrows, num_worker_per_row, + as_hip_type(a->get_const_values()), a->get_const_col_idxs(), + a->get_stride(), a->get_num_stored_elements_per_row(), + as_hip_type(b->get_const_values()), b->get_stride(), + as_hip_type(c->get_values()), c->get_stride()); + } else if (alpha != nullptr && beta != nullptr) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::spmv), + dim3(grid_size), dim3(block_size), 0, 0, nrows, num_worker_per_row, + as_hip_type(alpha->get_const_values()), + as_hip_type(a->get_const_values()), a->get_const_col_idxs(), + a->get_stride(), a->get_num_stored_elements_per_row(), + as_hip_type(b->get_const_values()), b->get_stride(), + as_hip_type(beta->get_const_values()), as_hip_type(c->get_values()), + c->get_stride()); + } else { + GKO_KERNEL_NOT_FOUND; + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_abstract_spmv, abstract_spmv); + + +// Returns {num_thread_per_worker, atomic, num_worker_per_row} for the given +// Ell matrix; the defaults are {1, 0, 1}. +template +std::array compute_thread_worker_and_atomicity( + std::shared_ptr exec, + const matrix::Ell *a) +{ + int num_thread_per_worker = 1; + int atomic = 0; + int num_worker_per_row = 1; + + const auto nrows = a->get_size()[0]; + const auto ell_ncols = a->get_num_stored_elements_per_row();
+ // TODO: num_threads_per_core should be tuned for AMD gpu + const auto nwarps = exec->get_num_warps_per_sm() * + exec->get_num_multiprocessor() * num_threads_per_core; + + // Use several threads to perform the reduction on each row when the + // matrix is wide (stored columns per row / rows > ratio). + // So that every thread has work, pick the largest power of 2 that is at + // most max_thread_per_worker and at most ell_ncols. If that value reaches + // max_thread_per_worker, more than one worker may work on the same row; + // atomic adds are then used because the workers write to the same + // position. The number of workers per row is bounded by nwarps / nrows. + // The nrows > 0 check keeps both divisions by nrows away from a matrix + // without rows (nwarps / nrows would otherwise divide by zero). + if (nrows > 0 && static_cast(ell_ncols) / nrows > ratio) { + while (num_thread_per_worker < max_thread_per_worker && + (num_thread_per_worker << 1) <= ell_ncols) { + num_thread_per_worker <<= 1; + } + if (num_thread_per_worker == max_thread_per_worker) { + num_worker_per_row = + std::min(ell_ncols / max_thread_per_worker, nwarps / nrows); + num_worker_per_row = std::max(num_worker_per_row, 1); + } + if (num_worker_per_row > 1) { + atomic = 1; + } + } + return {num_thread_per_worker, atomic, num_worker_per_row}; +} + + +} // namespace + + +// Picks the launch configuration with compute_thread_worker_and_atomicity and +// dispatches to the matching abstract_spmv instantiation, writing into c. +template +void spmv(std::shared_ptr exec, + const matrix::Ell *a, + const matrix::Dense *b, matrix::Dense *c) +{ + const auto data = compute_thread_worker_and_atomicity(exec, a); + const int num_thread_per_worker = std::get<0>(data); + const int atomic = std::get<1>(data); + const int num_worker_per_row = std::get<2>(data); + + /** + * info is the parameter for selecting the hip kernel.
+ * For info == 0 it uses the kernel with max_thread_per_worker threads per + * worker and atomic operations; for any other value it uses the kernel + * without atomic_add. + */ + const int info = (!atomic) * num_thread_per_worker; + if (atomic) { + // c is filled with zero before the atomic kernel is launched. + components::fill_array(exec, c->get_values(), + c->get_num_stored_elements(), zero()); + } + // Dispatch to the abstract_spmv instantiation whose compiled value equals + // info. + select_abstract_spmv( + compiled_kernels(), + [&info](int compiled_info) { return info == compiled_info; }, + syn::value_list(), syn::type_list<>(), num_worker_per_row, a, b, + c); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL); + + +// Variant of spmv that additionally forwards alpha and beta to abstract_spmv. +template +void advanced_spmv(std::shared_ptr exec, + const matrix::Dense *alpha, + const matrix::Ell *a, + const matrix::Dense *b, + const matrix::Dense *beta, + matrix::Dense *c) +{ + const auto data = compute_thread_worker_and_atomicity(exec, a); + const int num_thread_per_worker = std::get<0>(data); + const int atomic = std::get<1>(data); + const int num_worker_per_row = std::get<2>(data); + + /** + * info is the parameter for selecting the hip kernel.
+ * For info == 0 it uses the kernel with max_thread_per_worker threads per + * worker and atomic operations; for any other value it uses the kernel + * without atomic_add. + */ + const int info = (!atomic) * num_thread_per_worker; + if (atomic) { + // c is scaled by beta here, before the atomic kernel is launched. + dense::scale(exec, beta, c); + } + select_abstract_spmv( + compiled_kernels(), + [&info](int compiled_info) { return info == compiled_info; }, + syn::value_list(), syn::type_list<>(), num_worker_per_row, a, b, c, + alpha, beta); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); + + +// Converts an Ell matrix into result in two launches: +// kernel::initialize_zero_dense over the result storage, then +// kernel::fill_in_dense with the Ell column indices and values. +template +void convert_to_dense(std::shared_ptr exec, + const matrix::Ell *source, + matrix::Dense *result) +{ + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + const auto result_stride = result->get_stride(); + const auto col_idxs = source->get_const_col_idxs(); + const auto vals = source->get_const_values(); + const auto source_stride = source->get_stride(); + + // 2D launch: x covers result_stride, y covers num_rows. + const dim3 block_size(config::warp_size, + config::max_block_size / config::warp_size, 1); + const dim3 init_grid_dim(ceildiv(result_stride, block_size.x), + ceildiv(num_rows, block_size.y), 1); + hipLaunchKernelGGL(kernel::initialize_zero_dense, dim3(init_grid_dim), + dim3(block_size), 0, 0, num_rows, num_cols, + result_stride, as_hip_type(result->get_values())); + + // 1D launch with ceildiv(num_rows, default_block_size) blocks. + const auto grid_dim = ceildiv(num_rows, default_block_size); + hipLaunchKernelGGL(kernel::fill_in_dense, dim3(grid_dim), + dim3(default_block_size), 0, 0, num_rows, + source->get_num_stored_elements_per_row(), source_stride, + as_hip_type(col_idxs), as_hip_type(vals), result_stride, + as_hip_type(result->get_values())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL); + + +// Converts an Ell matrix into the pre-allocated Csr result: per-row counts are +// written into row_ptrs, turned into offsets by components::prefix_sum, and +// kernel::fill_in_csr then writes col_idxs and values. +template +void convert_to_csr(std::shared_ptr exec, + const matrix::Ell *source, + matrix::Csr *result) +{ + auto num_rows = result->get_size()[0]; + + auto row_ptrs = result->get_row_ptrs(); + auto col_idxs = result->get_col_idxs(); + auto
values = result->get_values(); + + const auto stride = source->get_stride(); + const auto max_nnz_per_row = source->get_num_stored_elements_per_row(); + + constexpr auto rows_per_block = + ceildiv(default_block_size, config::warp_size); + const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); + + // Per-row counts go directly into row_ptrs. + hipLaunchKernelGGL( + kernel::count_nnz_per_row, dim3(grid_dim_nnz), dim3(default_block_size), + 0, 0, num_rows, max_nnz_per_row, stride, + as_hip_type(source->get_const_values()), as_hip_type(row_ptrs)); + + size_type grid_dim = ceildiv(num_rows + 1, default_block_size); + + // Turn the counts into row offsets over num_rows + 1 entries. + components::prefix_sum(exec, row_ptrs, num_rows + 1); + + hipLaunchKernelGGL( + kernel::fill_in_csr, dim3(grid_dim), dim3(default_block_size), 0, 0, + num_rows, max_nnz_per_row, stride, + as_hip_type(source->get_const_values()), + as_hip_type(source->get_const_col_idxs()), as_hip_type(row_ptrs), + as_hip_type(col_idxs), as_hip_type(values)); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL); + + +// Writes to *result the sum (reduce_add_array) of the per-row counts that +// calculate_nonzeros_per_row stores in a temporary array allocated on exec. +template +void count_nonzeros(std::shared_ptr exec, + const matrix::Ell *source, + size_type *result) +{ + const auto num_rows = source->get_size()[0]; + auto nnz_per_row = Array(exec, num_rows); + + calculate_nonzeros_per_row(exec, source, &nnz_per_row); + + *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELL_COUNT_NONZEROS_KERNEL); + + +// Launches kernel::count_nnz_per_row over the Ell storage and stores the +// per-row counts in *result. The grid provides num_rows * warp_size threads +// in blocks of default_block_size. +template +void calculate_nonzeros_per_row(std::shared_ptr exec, + const matrix::Ell *source, + Array *result) +{ + const auto num_rows = source->get_size()[0]; + const auto max_nnz_per_row = source->get_num_stored_elements_per_row(); + const auto stride = source->get_stride(); + const auto values = source->get_const_values(); + + const auto warp_size = config::warp_size; + const auto grid_dim = ceildiv(num_rows * warp_size, default_block_size); + +
hipLaunchKernelGGL(kernel::count_nnz_per_row, dim3(grid_dim), + dim3(default_block_size), 0, 0, num_rows, + max_nnz_per_row, stride, as_hip_type(values), + as_hip_type(result->get_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELL_CALCULATE_NONZEROS_PER_ROW_KERNEL); + + +} // namespace ell +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/matrix/hybrid_kernels.hip.cpp b/hip/matrix/hybrid_kernels.hip.cpp new file mode 100644 index 00000000000..e9efb0eb8ee --- /dev/null +++ b/hip/matrix/hybrid_kernels.hip.cpp @@ -0,0 +1,194 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/hybrid_kernels.hpp" + + +#include + + +#include +#include + + +#include "core/components/fill_array.hpp" +#include "core/components/prefix_sum.hpp" +#include "core/matrix/coo_kernels.hpp" +#include "core/matrix/ell_kernels.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/atomic.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/format_conversion.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/segment_scan.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The Hybrid matrix format namespace. 
+ * + * @ingroup hybrid + */ +namespace hybrid { + + +constexpr int default_block_size = 512; +constexpr int warps_in_block = 4; + + +#include "common/matrix/hybrid_kernels.hpp.inc" + + +// Conversion from Hybrid to Dense is not available in the HIP backend: the +// body is the GKO_NOT_IMPLEMENTED stub. +template +void convert_to_dense(std::shared_ptr exec, + const matrix::Hybrid *source, + matrix::Dense *result) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_HYBRID_CONVERT_TO_DENSE_KERNEL); + + +// Converts a Hybrid matrix (ELL part + COO part) into the pre-allocated Csr +// result: per-row counts of both parts are added into row_ptrs, turned into +// offsets by components::prefix_sum, and kernel::fill_in_csr then writes the +// column indices and values. +template +void convert_to_csr(std::shared_ptr exec, + const matrix::Hybrid *source, + matrix::Csr *result) +{ + const auto num_rows = source->get_size()[0]; + auto coo_offset = Array(exec, num_rows + 1); + auto coo_val = source->get_const_coo_values(); + auto coo_col = source->get_const_coo_col_idxs(); + auto coo_row = source->get_const_coo_row_idxs(); + auto ell_val = source->get_const_ell_values(); + auto ell_col = source->get_const_ell_col_idxs(); + const auto stride = source->get_ell_stride(); + const auto max_nnz_per_row = source->get_ell_num_stored_elements_per_row(); + const auto coo_num_stored_elements = source->get_coo_num_stored_elements(); + + // Compute the row offsets of the Coo part (stored elements, including + // explicit zeros) into coo_offset. + // NOTE(review): grid_num is 0 when the Coo part is empty - confirm that the + // launch and the contents of coo_offset are still valid in that case. + size_type grid_num = ceildiv(coo_num_stored_elements, default_block_size); + hipLaunchKernelGGL(coo::kernel::convert_row_idxs_to_ptrs, dim3(grid_num), + dim3(default_block_size), 0, 0, as_hip_type(coo_row), + coo_num_stored_elements, + as_hip_type(coo_offset.get_data()), num_rows + 1); + + // Compute the row ptrs of Csr + auto row_ptrs = result->get_row_ptrs(); + auto coo_row_ptrs = Array(exec, num_rows); + + components::fill_array(exec, row_ptrs, num_rows + 1, zero()); + // NOTE(review): the grid assumes warps_in_block rows per block while the + // blocks have default_block_size threads - confirm the over-provisioning + // is intended. + grid_num = ceildiv(num_rows, warps_in_block); + hipLaunchKernelGGL(ell::kernel::count_nnz_per_row, dim3(grid_num), + dim3(default_block_size), 0, 0, num_rows, + max_nnz_per_row, stride, as_hip_type(ell_val), + as_hip_type(row_ptrs)); + + components::fill_array(exec, coo_row_ptrs.get_data(), num_rows, + zero()); + + auto nwarps = + coo::host_kernel::calculate_nwarps(exec,
coo_num_stored_elements); + // The Coo counting kernel is only launched when calculate_nwarps reports + // work; otherwise coo_row_ptrs keeps the zeros written above. + if (nwarps > 0) { + int num_lines = + ceildiv(coo_num_stored_elements, nwarps * config::warp_size); + const dim3 coo_block(config::warp_size, warps_in_block, 1); + const dim3 coo_grid(ceildiv(nwarps, warps_in_block), 1); + + hipLaunchKernelGGL( + kernel::count_coo_row_nnz, dim3(coo_grid), dim3(coo_block), 0, 0, + coo_num_stored_elements, num_lines, as_hip_type(coo_val), + as_hip_type(coo_row), as_hip_type(coo_row_ptrs.get_data())); + } + + // Combine the Coo per-row counts with the Ell counts held in row_ptrs. + hipLaunchKernelGGL(kernel::add, dim3(grid_num), dim3(default_block_size), 0, + 0, num_rows, as_hip_type(row_ptrs), + as_hip_type(coo_row_ptrs.get_const_data())); + + components::prefix_sum(exec, row_ptrs, num_rows + 1); + + // Fill the column indices and values of the Csr result + grid_num = ceildiv(num_rows, default_block_size); + hipLaunchKernelGGL( + kernel::fill_in_csr, dim3(grid_num), dim3(default_block_size), 0, 0, + num_rows, max_nnz_per_row, stride, as_hip_type(ell_val), + as_hip_type(ell_col), as_hip_type(coo_val), as_hip_type(coo_col), + as_hip_type(coo_offset.get_const_data()), as_hip_type(row_ptrs), + as_hip_type(result->get_col_idxs()), as_hip_type(result->get_values())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL); + + +// Writes to *result the sum of the count of the Ell part (ell::count_nonzeros) +// and the count of the Coo part (kernel::count_coo_row_nnz followed by +// reduce_add_array). The Coo count stays 0 when calculate_nwarps returns 0. +template +void count_nonzeros(std::shared_ptr exec, + const matrix::Hybrid *source, + size_type *result) +{ + size_type ell_nnz = 0; + size_type coo_nnz = 0; + ell::count_nonzeros(exec, source->get_ell(), &ell_nnz); + + auto nnz = source->get_coo_num_stored_elements(); + auto nwarps = coo::host_kernel::calculate_nwarps(exec, nnz); + if (nwarps > 0) { + int num_lines = ceildiv(nnz, nwarps * config::warp_size); + const dim3 coo_block(config::warp_size, warps_in_block, 1); + const dim3 coo_grid(ceildiv(nwarps, warps_in_block), 1); + const auto num_rows = source->get_size()[0]; + auto nnz_per_row = Array(exec, num_rows); + components::fill_array(exec, nnz_per_row.get_data(), num_rows, + zero()); + hipLaunchKernelGGL(kernel::count_coo_row_nnz, dim3(coo_grid), +
dim3(coo_block), 0, 0, nnz, num_lines, + as_hip_type(source->get_coo()->get_const_values()), + as_hip_type(source->get_coo()->get_const_row_idxs()), + as_hip_type(nnz_per_row.get_data())); + + coo_nnz = + reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); + } + + *result = ell_nnz + coo_nnz; +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_HYBRID_COUNT_NONZEROS_KERNEL); + + +} // namespace hybrid +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/matrix/sellp_kernels.hip.cpp b/hip/matrix/sellp_kernels.hip.cpp new file mode 100644 index 00000000000..95a621f1886 --- /dev/null +++ b/hip/matrix/sellp_kernels.hip.cpp @@ -0,0 +1,227 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/sellp_kernels.hpp" + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/components/prefix_sum.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/hipsparse_bindings.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The SELL-P matrix format namespace. 
+ * + * @ingroup sellp + */ +namespace sellp { + + +constexpr auto default_block_size = 512; + + +#include "common/matrix/sellp_kernels.hpp.inc" + + +// Launches spmv_kernel for a Sellp matrix a and dense b, writing into c. +// Blocks have matrix::default_slice_size threads; the grid has one x-entry per +// default_slice_size rows of a and one y-entry per column of b. +// NOTE(review): the launch geometry uses matrix::default_slice_size rather +// than a->get_slice_size() - confirm matrices with a custom slice size are +// handled. +template +void spmv(std::shared_ptr exec, + const matrix::Sellp *a, + const matrix::Dense *b, matrix::Dense *c) +{ + const dim3 blockSize(matrix::default_slice_size); + const dim3 gridSize(ceildiv(a->get_size()[0], matrix::default_slice_size), + b->get_size()[1]); + + hipLaunchKernelGGL( + spmv_kernel, dim3(gridSize), dim3(blockSize), 0, 0, a->get_size()[0], + b->get_size()[1], b->get_stride(), c->get_stride(), + a->get_const_slice_lengths(), a->get_const_slice_sets(), + as_hip_type(a->get_const_values()), a->get_const_col_idxs(), + as_hip_type(b->get_const_values()), as_hip_type(c->get_values())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL); + + +// Launches advanced_spmv_kernel with the same geometry as spmv, additionally +// passing the values of alpha and beta. +template +void advanced_spmv(std::shared_ptr exec, + const matrix::Dense *alpha, + const matrix::Sellp *a, + const matrix::Dense *b, + const matrix::Dense *beta, + matrix::Dense *c) +{ + const dim3 blockSize(matrix::default_slice_size); + const dim3 gridSize(ceildiv(a->get_size()[0], matrix::default_slice_size), + b->get_size()[1]); + + hipLaunchKernelGGL( + advanced_spmv_kernel, dim3(gridSize), dim3(blockSize), 0, 0, + a->get_size()[0], b->get_size()[1], b->get_stride(), c->get_stride(), + a->get_const_slice_lengths(), a->get_const_slice_sets(), + as_hip_type(alpha->get_const_values()), + as_hip_type(a->get_const_values()), a->get_const_col_idxs(), + as_hip_type(b->get_const_values()), + as_hip_type(beta->get_const_values()), as_hip_type(c->get_values())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL); + + +// Converts a Sellp matrix into result in two launches: +// kernel::initialize_zero_dense over the result storage, then +// kernel::fill_in_dense with the slice data, column indices and values. +template +void convert_to_dense(std::shared_ptr exec, + const matrix::Sellp *source, + matrix::Dense *result) +{ + const auto num_rows = source->get_size()[0]; + const auto num_cols = source->get_size()[1]; + const auto vals = source->get_const_values(); + const auto col_idxs =
source->get_const_col_idxs(); + const auto slice_lengths = source->get_const_slice_lengths(); + const auto slice_sets = source->get_const_slice_sets(); + const auto slice_size = source->get_slice_size(); + + const auto slice_num = ceildiv(num_rows, slice_size); + + // 2D launch: x covers the result stride, y covers num_rows. + const dim3 block_size(config::warp_size, + config::max_block_size / config::warp_size, 1); + const dim3 init_grid_dim(ceildiv(result->get_stride(), block_size.x), + ceildiv(num_rows, block_size.y), 1); + + hipLaunchKernelGGL(kernel::initialize_zero_dense, dim3(init_grid_dim), + dim3(block_size), 0, 0, num_rows, num_cols, + result->get_stride(), as_hip_type(result->get_values())); + + // The grid provides warp_size threads for each of the + // slice_size * slice_num rows. + constexpr auto threads_per_row = config::warp_size; + const auto grid_dim = + ceildiv(slice_size * slice_num * threads_per_row, default_block_size); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::fill_in_dense), + dim3(grid_dim), dim3(default_block_size), 0, 0, num_rows, + num_cols, result->get_stride(), slice_size, + as_hip_type(slice_lengths), as_hip_type(slice_sets), + as_hip_type(col_idxs), as_hip_type(vals), + as_hip_type(result->get_values())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SELLP_CONVERT_TO_DENSE_KERNEL); + + +// Converts a Sellp matrix into the pre-allocated Csr result: per-row counts +// are written into the row pointers, turned into offsets by +// components::prefix_sum, and kernel::fill_in_csr then writes the column +// indices and values. +template +void convert_to_csr(std::shared_ptr exec, + const matrix::Sellp *source, + matrix::Csr *result) +{ + const auto num_rows = source->get_size()[0]; + const auto slice_size = source->get_slice_size(); + + const auto source_values = source->get_const_values(); + const auto source_slice_sets = source->get_const_slice_sets(); + const auto source_col_idxs = source->get_const_col_idxs(); + + auto result_values = result->get_values(); + auto result_col_idxs = result->get_col_idxs(); + auto result_row_ptrs = result->get_row_ptrs(); + + auto grid_dim = ceildiv(num_rows * config::warp_size, default_block_size); + + hipLaunchKernelGGL( +
kernel::count_nnz_per_row, dim3(grid_dim), dim3(default_block_size), 0, + 0, num_rows, slice_size, as_hip_type(source_slice_sets), + as_hip_type(source_values), as_hip_type(result_row_ptrs)); + + // Turn the per-row counts into row offsets over num_rows + 1 entries. + components::prefix_sum(exec, result_row_ptrs, num_rows + 1); + + grid_dim = ceildiv(num_rows, default_block_size); + + hipLaunchKernelGGL( + kernel::fill_in_csr, dim3(grid_dim), dim3(default_block_size), 0, 0, + num_rows, slice_size, as_hip_type(source_slice_sets), + as_hip_type(source_col_idxs), as_hip_type(source_values), + as_hip_type(result_row_ptrs), as_hip_type(result_col_idxs), + as_hip_type(result_values)); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL); + + +// Writes to *result the sum (reduce_add_array) of the per-row counts that +// kernel::count_nnz_per_row stores in a temporary array allocated on exec. +template +void count_nonzeros(std::shared_ptr exec, + const matrix::Sellp *source, + size_type *result) +{ + const auto num_rows = source->get_size()[0]; + const auto slice_size = source->get_slice_size(); + const auto slice_sets = source->get_const_slice_sets(); + const auto values = source->get_const_values(); + + auto nnz_per_row = Array(exec, num_rows); + + // The grid provides num_rows * warp_size threads in blocks of + // default_block_size. + auto grid_dim = ceildiv(num_rows * config::warp_size, default_block_size); + + hipLaunchKernelGGL(kernel::count_nnz_per_row, dim3(grid_dim), + dim3(default_block_size), 0, 0, num_rows, slice_size, + as_hip_type(slice_sets), as_hip_type(values), + as_hip_type(nnz_per_row.get_data())); + + *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SELLP_COUNT_NONZEROS_KERNEL); + + +} // namespace sellp +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/matrix/sparsity_csr_kernels.hip.cpp b/hip/matrix/sparsity_csr_kernels.hip.cpp new file mode 100644 index 00000000000..8ab3066f1ff --- /dev/null +++ b/hip/matrix/sparsity_csr_kernels.hip.cpp @@ -0,0 +1,124 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved.
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/sparsity_csr_kernels.hpp" + + +#include + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The Compressed sparse row matrix format namespace. 
+ * + * @ingroup sparsity + */ +namespace sparsity_csr { + + +template +void spmv(std::shared_ptr exec, + const matrix::SparsityCsr *a, + const matrix::Dense *b, + matrix::Dense *c) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL); + + +template +void advanced_spmv(std::shared_ptr exec, + const matrix::Dense *alpha, + const matrix::SparsityCsr *a, + const matrix::Dense *b, + const matrix::Dense *beta, + matrix::Dense *c) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); + + +template +void count_num_diagonal_elements( + std::shared_ptr exec, + const matrix::SparsityCsr *matrix, + size_type *num_diagonal_elements) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SPARSITY_CSR_COUNT_NUM_DIAGONAL_ELEMENTS_KERNEL); + + +template +void remove_diagonal_elements( + std::shared_ptr exec, const IndexType *row_ptrs, + const IndexType *col_idxs, + matrix::SparsityCsr *matrix) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SPARSITY_CSR_REMOVE_DIAGONAL_ELEMENTS_KERNEL); + + +template +void transpose(std::shared_ptr exec, + const matrix::SparsityCsr *orig, + matrix::SparsityCsr *trans) + GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL); + + +template +void sort_by_column_index(std::shared_ptr exec, + matrix::SparsityCsr *to_sort) + GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX); + + +template +void is_sorted_by_column_index( + std::shared_ptr exec, + const matrix::SparsityCsr *to_check, + bool *is_sorted) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX); + + +} // namespace sparsity_csr +} // namespace hip +} // namespace kernels +} // namespace gko diff --git 
a/hip/preconditioner/isai_kernels.hip.cpp b/hip/preconditioner/isai_kernels.hip.cpp new file mode 100644 index 00000000000..f2289eba530 --- /dev/null +++ b/hip/preconditioner/isai_kernels.hip.cpp @@ -0,0 +1,166 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/preconditioner/isai_kernels.hpp" + + +#include + + +#include +#include +#include + + +#include "core/components/prefix_sum.hpp" +#include "core/matrix/csr_builder.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/merging.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/uninitialized_array.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The Isai preconditioner namespace. + * @ref Isai + * @ingroup isai + */ +namespace isai { + + +constexpr int subwarp_size{row_size_limit}; +constexpr int subwarps_per_block{2}; +constexpr int default_block_size{subwarps_per_block * subwarp_size}; + + +#include "common/preconditioner/isai_kernels.hpp.inc" + + +template +void generate_tri_inverse(std::shared_ptr exec, + const matrix::Csr *input, + matrix::Csr *inverse, + IndexType *excess_rhs_ptrs, IndexType *excess_nz_ptrs, + bool lower) +{ + const auto num_rows = input->get_size()[0]; + + const dim3 block(default_block_size, 1, 1); + const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1); + if (lower) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + kernel::generate_l_inverse), + grid, block, 0, 0, static_cast(num_rows), + input->get_const_row_ptrs(), input->get_const_col_idxs(), + as_hip_type(input->get_const_values()), inverse->get_row_ptrs(), + inverse->get_col_idxs(), as_hip_type(inverse->get_values()), + excess_rhs_ptrs, excess_nz_ptrs); + } else { + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + kernel::generate_u_inverse), + grid, block, 0, 0, static_cast(num_rows), + input->get_const_row_ptrs(), input->get_const_col_idxs(), + as_hip_type(input->get_const_values()), inverse->get_row_ptrs(), + inverse->get_col_idxs(), 
as_hip_type(inverse->get_values()), + excess_rhs_ptrs, excess_nz_ptrs); + } + components::prefix_sum(exec, excess_rhs_ptrs, num_rows + 1); + components::prefix_sum(exec, excess_nz_ptrs, num_rows + 1); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL); + + +template +void generate_excess_system(std::shared_ptr exec, + const matrix::Csr *input, + const matrix::Csr *inverse, + const IndexType *excess_rhs_ptrs, + const IndexType *excess_nz_ptrs, + matrix::Csr *excess_system, + matrix::Dense *excess_rhs) +{ + const auto num_rows = input->get_size()[0]; + + const dim3 block(default_block_size, 1, 1); + const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::generate_excess_system), grid, + block, 0, 0, static_cast(num_rows), + input->get_const_row_ptrs(), input->get_const_col_idxs(), + as_hip_type(input->get_const_values()), inverse->get_const_row_ptrs(), + inverse->get_const_col_idxs(), excess_rhs_ptrs, excess_nz_ptrs, + excess_system->get_row_ptrs(), excess_system->get_col_idxs(), + as_hip_type(excess_system->get_values()), + as_hip_type(excess_rhs->get_values())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL); + + +template +void scatter_excess_solution(std::shared_ptr exec, + const IndexType *excess_rhs_ptrs, + const matrix::Dense *excess_solution, + matrix::Csr *inverse) +{ + const auto num_rows = inverse->get_size()[0]; + + const dim3 block(default_block_size, 1, 1); + const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::copy_excess_solution), grid, + block, 0, 0, static_cast(num_rows), + inverse->get_const_row_ptrs(), excess_rhs_ptrs, + as_hip_type(excess_solution->get_const_values()), + as_hip_type(inverse->get_values())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL); + + +} // 
namespace isai +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp new file mode 100644 index 00000000000..d7d3e87970c --- /dev/null +++ b/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp @@ -0,0 +1,149 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include + + +#include + + +#include "core/base/extended_float.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/warp_blas.hip.hpp" +#include "hip/preconditioner/jacobi_common.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The Jacobi preconditioner namespace. + * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +#include "common/preconditioner/jacobi_advanced_apply_kernel.hpp.inc" + + +namespace { + + +template +void advanced_apply( + syn::value_list, size_type num_blocks, + const precision_reduction *block_precisions, + const IndexType *block_pointers, const ValueType *blocks, + const preconditioner::block_interleaved_storage_scheme + &storage_scheme, + const ValueType *alpha, const ValueType *b, size_type b_stride, + ValueType *x, size_type x_stride) +{ + constexpr int subwarp_size = get_larger_power(max_block_size); + constexpr int blocks_per_warp = config::warp_size / subwarp_size; + const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), + 1, 1); + const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); + + if (block_precisions) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + kernel::advanced_adaptive_apply), + dim3(grid_size), dim3(block_size), 0, 0, as_hip_type(blocks), + storage_scheme, block_precisions, block_pointers, num_blocks, + as_hip_type(alpha), as_hip_type(b), b_stride, as_hip_type(x), + x_stride); + } else { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::advanced_apply), + dim3(grid_size), dim3(block_size), 0, 0, 
as_hip_type(blocks), + storage_scheme, block_pointers, num_blocks, as_hip_type(alpha), + as_hip_type(b), b_stride, as_hip_type(x), x_stride); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_advanced_apply, advanced_apply); + + +} // namespace + + +template +void apply(std::shared_ptr exec, size_type num_blocks, + uint32 max_block_size, + const preconditioner::block_interleaved_storage_scheme + &storage_scheme, + const Array &block_precisions, + const Array &block_pointers, + const Array &blocks, + const matrix::Dense *alpha, + const matrix::Dense *b, + const matrix::Dense *beta, matrix::Dense *x) +{ + // TODO: write a special kernel for multiple RHS + dense::scale(exec, beta, x); + for (size_type col = 0; col < b->get_size()[1]; ++col) { + select_advanced_apply( + compiled_kernels(), + [&](int compiled_block_size) { + return max_block_size <= compiled_block_size; + }, + syn::value_list(), + syn::type_list<>(), num_blocks, block_precisions.get_const_data(), + block_pointers.get_const_data(), blocks.get_const_data(), + storage_scheme, alpha->get_const_values(), + b->get_const_values() + col, b->get_stride(), x->get_values() + col, + x->get_stride()); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL); + + +} // namespace jacobi +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/preconditioner/jacobi_common.hip.hpp b/hip/preconditioner/jacobi_common.hip.hpp new file mode 100644 index 00000000000..d81dd3f9e97 --- /dev/null +++ b/hip/preconditioner/jacobi_common.hip.hpp @@ -0,0 +1,67 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include +#include + + +#include "hip/base/config.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +namespace jacobi { + + +/** + * A compile-time list of block sizes for which dedicated generate and apply + * kernels should be compiled. + */ +#ifdef GINKGO_JACOBI_FULL_OPTIMIZATIONS +using compiled_kernels = syn::as_list>; +#else +using compiled_kernels = + syn::value_list; +#endif + + +constexpr int get_larger_power(int value, int guess = 1) +{ + return guess >= value ? 
guess : get_larger_power(value, guess << 1); +} + + +} // namespace jacobi +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/preconditioner/jacobi_generate_kernel.hip.cpp b/hip/preconditioner/jacobi_generate_kernel.hip.cpp new file mode 100644 index 00000000000..6f8def4af6e --- /dev/null +++ b/hip/preconditioner/jacobi_generate_kernel.hip.cpp @@ -0,0 +1,150 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include + + +#include +#include + + +#include "core/base/extended_float.hpp" +#include "core/components/fill_array.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/diagonal_block_manipulation.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/uninitialized_array.hip.hpp" +#include "hip/components/warp_blas.hip.hpp" +#include "hip/preconditioner/jacobi_common.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The Jacobi preconditioner namespace. + * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +#include "common/preconditioner/jacobi_generate_kernel.hpp.inc" + + +namespace { + + +template +void generate(syn::value_list, + const matrix::Csr *mtx, + remove_complex accuracy, ValueType *block_data, + const preconditioner::block_interleaved_storage_scheme + &storage_scheme, + remove_complex *conditioning, + precision_reduction *block_precisions, + const IndexType *block_ptrs, size_type num_blocks) +{ + constexpr int subwarp_size = get_larger_power(max_block_size); + constexpr int blocks_per_warp = config::warp_size / subwarp_size; + const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), + 1, 1); + const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); + + if (block_precisions) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + kernel::adaptive_generate), + dim3(grid_size), dim3(block_size), 0, 0, mtx->get_size()[0], + mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + as_hip_type(mtx->get_const_values()), as_hip_type(accuracy), + as_hip_type(block_data), storage_scheme, 
as_hip_type(conditioning), + block_precisions, block_ptrs, num_blocks); + } else { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::generate), + dim3(grid_size), dim3(block_size), 0, 0, mtx->get_size()[0], + mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + as_hip_type(mtx->get_const_values()), as_hip_type(block_data), + storage_scheme, block_ptrs, num_blocks); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generate, generate); + + +} // namespace + + +template +void generate(std::shared_ptr exec, + const matrix::Csr *system_matrix, + size_type num_blocks, uint32 max_block_size, + remove_complex accuracy, + const preconditioner::block_interleaved_storage_scheme + &storage_scheme, + Array> &conditioning, + Array &block_precisions, + const Array &block_pointers, Array &blocks) +{ + components::fill_array(exec, blocks.get_data(), blocks.get_num_elems(), + zero()); + select_generate( + compiled_kernels(), + [&](int compiled_block_size) { + return max_block_size <= compiled_block_size; + }, + syn::value_list(), syn::type_list<>(), + system_matrix, accuracy, blocks.get_data(), storage_scheme, + conditioning.get_data(), block_precisions.get_data(), + block_pointers.get_const_data(), num_blocks); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_JACOBI_GENERATE_KERNEL); + + +} // namespace jacobi +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp new file mode 100644 index 00000000000..b2d249f12b7 --- /dev/null +++ b/hip/preconditioner/jacobi_kernels.hip.cpp @@ -0,0 +1,262 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include + + +#include + + +#include "core/base/extended_float.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/preconditioner/jacobi_common.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The Jacobi preconditioner namespace. 
+ * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { +namespace { + + +// a total of 32/16 warps (1024 threads) +#if GINKGO_HIP_PLATFORM_HCC +constexpr int default_num_warps = 16; +#else // GINKGO_HIP_PLATFORM_NVCC +constexpr int default_num_warps = 32; +#endif +// with current architectures, at most 32 warps can be scheduled per SM (and +// current GPUs have at most 84 SMs) +constexpr int default_grid_size = 32 * 32 * 128; + + +#include "common/preconditioner/jacobi_kernels.hpp.inc" + + +template +size_type find_natural_blocks(std::shared_ptr exec, + const matrix::Csr *mtx, + int32 max_block_size, + IndexType *__restrict__ block_ptrs) +{ + Array nums(exec, 1); + + Array matching_next_row(exec, mtx->get_size()[0] - 1); + + const dim3 block_size(config::warp_size, 1, 1); + const dim3 grid_size( + ceildiv(mtx->get_size()[0] * config::warp_size, block_size.x), 1, 1); + hipLaunchKernelGGL(compare_adjacent_rows, dim3(grid_size), dim3(block_size), + 0, 0, mtx->get_size()[0], max_block_size, + mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + matching_next_row.get_data()); + hipLaunchKernelGGL(generate_natural_block_pointer, dim3(1), dim3(1), 0, 0, + mtx->get_size()[0], max_block_size, + matching_next_row.get_const_data(), block_ptrs, + nums.get_data()); + nums.set_executor(exec->get_master()); + return nums.get_const_data()[0]; +} + + +template +inline size_type agglomerate_supervariables( + std::shared_ptr exec, int32 max_block_size, + size_type num_natural_blocks, IndexType *block_ptrs) +{ + Array nums(exec, 1); + + hipLaunchKernelGGL(agglomerate_supervariables_kernel, dim3(1), dim3(1), 0, + 0, max_block_size, num_natural_blocks, block_ptrs, + nums.get_data()); + + nums.set_executor(exec->get_master()); + return nums.get_const_data()[0]; +} + + +} // namespace + + +void initialize_precisions(std::shared_ptr exec, + const Array &source, + Array &precisions) +{ + const auto block_size = default_num_warps * config::warp_size; + const auto grid_size = min( + 
default_grid_size, + static_cast(ceildiv(precisions.get_num_elems(), block_size))); + hipLaunchKernelGGL(HIP_KERNEL_NAME(duplicate_array), + dim3(grid_size), dim3(block_size), 0, 0, + source.get_const_data(), source.get_num_elems(), + precisions.get_data(), precisions.get_num_elems()); +} + + +template +void find_blocks(std::shared_ptr exec, + const matrix::Csr *system_matrix, + uint32 max_block_size, size_type &num_blocks, + Array &block_pointers) +{ + auto num_natural_blocks = find_natural_blocks( + exec, system_matrix, max_block_size, block_pointers.get_data()); + num_blocks = agglomerate_supervariables( + exec, max_block_size, num_natural_blocks, block_pointers.get_data()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL); + + +namespace { + + +template +void transpose_jacobi( + syn::value_list, size_type num_blocks, + const precision_reduction *block_precisions, + const IndexType *block_pointers, const ValueType *blocks, + const preconditioner::block_interleaved_storage_scheme + &storage_scheme, + ValueType *out_blocks) +{ + constexpr int subwarp_size = get_larger_power(max_block_size); + constexpr int blocks_per_warp = config::warp_size / subwarp_size; + const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), + 1, 1); + const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); + + if (block_precisions) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + adaptive_transpose_jacobi), + dim3(grid_size), dim3(block_size), 0, 0, as_hip_type(blocks), + storage_scheme, block_precisions, block_pointers, num_blocks, + as_hip_type(out_blocks)); + } else { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(transpose_jacobi), + dim3(grid_size), dim3(block_size), 0, 0, as_hip_type(blocks), + storage_scheme, block_pointers, num_blocks, + as_hip_type(out_blocks)); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_jacobi, transpose_jacobi); + + +} // namespace + + +template +void transpose_jacobi( + 
std::shared_ptr exec, size_type num_blocks, + uint32 max_block_size, const Array &block_precisions, + const Array &block_pointers, const Array &blocks, + const preconditioner::block_interleaved_storage_scheme + &storage_scheme, + Array &out_blocks) +{ + select_transpose_jacobi( + compiled_kernels(), + [&](int compiled_block_size) { + return max_block_size <= compiled_block_size; + }, + syn::value_list(), + syn::type_list<>(), num_blocks, block_precisions.get_const_data(), + block_pointers.get_const_data(), blocks.get_const_data(), + storage_scheme, out_blocks.get_data()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL); + + +template +void conj_transpose_jacobi( + std::shared_ptr exec, size_type num_blocks, + uint32 max_block_size, const Array &block_precisions, + const Array &block_pointers, const Array &blocks, + const preconditioner::block_interleaved_storage_scheme + &storage_scheme, + Array &out_blocks) +{ + select_transpose_jacobi( + compiled_kernels(), + [&](int compiled_block_size) { + return max_block_size <= compiled_block_size; + }, + syn::value_list(), + syn::type_list<>(), num_blocks, block_precisions.get_const_data(), + block_pointers.get_const_data(), blocks.get_const_data(), + storage_scheme, out_blocks.get_data()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL); + + +template +void convert_to_dense( + std::shared_ptr exec, size_type num_blocks, + const Array &block_precisions, + const Array &block_pointers, const Array &blocks, + const preconditioner::block_interleaved_storage_scheme + &storage_scheme, + ValueType *result_values, size_type result_stride) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL); + + +} // namespace jacobi +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp 
b/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp new file mode 100644 index 00000000000..0c2fefb1afc --- /dev/null +++ b/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp @@ -0,0 +1,143 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include + + +#include + + +#include "core/base/extended_float.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/warp_blas.hip.hpp" +#include "hip/preconditioner/jacobi_common.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The Jacobi preconditioner namespace. + * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +#include "common/preconditioner/jacobi_simple_apply_kernel.hpp.inc" + + +namespace { + + +template +void apply(syn::value_list, size_type num_blocks, + const precision_reduction *block_precisions, + const IndexType *block_pointers, const ValueType *blocks, + const preconditioner::block_interleaved_storage_scheme + &storage_scheme, + const ValueType *b, size_type b_stride, ValueType *x, + size_type x_stride) +{ + constexpr int subwarp_size = get_larger_power(max_block_size); + constexpr int blocks_per_warp = config::warp_size / subwarp_size; + const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), + 1, 1); + const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); + + if (block_precisions) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::adaptive_apply), + dim3(grid_size), dim3(block_size), 0, 0, as_hip_type(blocks), + storage_scheme, block_precisions, block_pointers, num_blocks, + as_hip_type(b), b_stride, as_hip_type(x), x_stride); + } else { + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + kernel::apply), + dim3(grid_size), dim3(block_size), 0, 0, as_hip_type(blocks), + storage_scheme, block_pointers, num_blocks, as_hip_type(b), 
+ b_stride, as_hip_type(x), x_stride); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_apply, apply); + + +} // namespace + + +template +void simple_apply( + std::shared_ptr exec, size_type num_blocks, + uint32 max_block_size, + const preconditioner::block_interleaved_storage_scheme + &storage_scheme, + const Array &block_precisions, + const Array &block_pointers, const Array &blocks, + const matrix::Dense *b, matrix::Dense *x) +{ + // TODO: write a special kernel for multiple RHS + for (size_type col = 0; col < b->get_size()[1]; ++col) { + select_apply( + compiled_kernels(), + [&](int compiled_block_size) { + return max_block_size <= compiled_block_size; + }, + syn::value_list(), + syn::type_list<>(), num_blocks, block_precisions.get_const_data(), + block_pointers.get_const_data(), blocks.get_const_data(), + storage_scheme, b->get_const_values() + col, b->get_stride(), + x->get_values() + col, x->get_stride()); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL); + + +} // namespace jacobi +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/solver/bicg_kernels.hip.cpp b/hip/solver/bicg_kernels.hip.cpp new file mode 100644 index 00000000000..e773520b258 --- /dev/null +++ b/hip/solver/bicg_kernels.hip.cpp @@ -0,0 +1,149 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/solver/bicg_kernels.hpp" + + +#include + + +#include +#include + + +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The BICG solver namespace. 
+ * + * @ingroup bicg + */ +namespace bicg { + + +constexpr int default_block_size = 512; + + +#include "common/solver/bicg_kernels.hpp.inc" + + +template +void initialize(std::shared_ptr exec, + const matrix::Dense *b, matrix::Dense *r, + matrix::Dense *z, matrix::Dense *p, + matrix::Dense *q, matrix::Dense *prev_rho, + matrix::Dense *rho, matrix::Dense *r2, + matrix::Dense *z2, matrix::Dense *p2, + matrix::Dense *q2, + Array *stop_status) +{ + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(b->get_size()[0] * b->get_stride(), block_size.x), 1, 1); + + hipLaunchKernelGGL( + initialize_kernel, dim3(grid_size), dim3(block_size), 0, 0, + b->get_size()[0], b->get_size()[1], b->get_stride(), + as_hip_type(b->get_const_values()), as_hip_type(r->get_values()), + as_hip_type(z->get_values()), as_hip_type(p->get_values()), + as_hip_type(q->get_values()), as_hip_type(r2->get_values()), + as_hip_type(z2->get_values()), as_hip_type(p2->get_values()), + as_hip_type(q2->get_values()), as_hip_type(prev_rho->get_values()), + as_hip_type(rho->get_values()), as_hip_type(stop_status->get_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_INITIALIZE_KERNEL); + + +template +void step_1(std::shared_ptr exec, + matrix::Dense *p, const matrix::Dense *z, + matrix::Dense *p2, const matrix::Dense *z2, + const matrix::Dense *rho, + const matrix::Dense *prev_rho, + const Array *stop_status) +{ + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(p->get_size()[0] * p->get_stride(), block_size.x), 1, 1); + + hipLaunchKernelGGL( + step_1_kernel, dim3(grid_size), dim3(block_size), 0, 0, + p->get_size()[0], p->get_size()[1], p->get_stride(), + as_hip_type(p->get_values()), as_hip_type(z->get_const_values()), + as_hip_type(p2->get_values()), as_hip_type(z2->get_const_values()), + as_hip_type(rho->get_const_values()), + as_hip_type(prev_rho->get_const_values()), + as_hip_type(stop_status->get_const_data())); +} + 
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_1_KERNEL); + + +template +void step_2(std::shared_ptr exec, + matrix::Dense *x, matrix::Dense *r, + matrix::Dense *r2, const matrix::Dense *p, + const matrix::Dense *q, + const matrix::Dense *q2, + const matrix::Dense *beta, + const matrix::Dense *rho, + const Array *stop_status) +{ + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(p->get_size()[0] * p->get_stride(), block_size.x), 1, 1); + + hipLaunchKernelGGL( + step_2_kernel, dim3(grid_size), dim3(block_size), 0, 0, + p->get_size()[0], p->get_size()[1], p->get_stride(), x->get_stride(), + as_hip_type(x->get_values()), as_hip_type(r->get_values()), + as_hip_type(r2->get_values()), as_hip_type(p->get_const_values()), + as_hip_type(q->get_const_values()), as_hip_type(q2->get_const_values()), + as_hip_type(beta->get_const_values()), + as_hip_type(rho->get_const_values()), + as_hip_type(stop_status->get_const_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_2_KERNEL); + + +} // namespace bicg +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/solver/bicgstab_kernels.hip.cpp b/hip/solver/bicgstab_kernels.hip.cpp new file mode 100644 index 00000000000..a8776876f6f --- /dev/null +++ b/hip/solver/bicgstab_kernels.hip.cpp @@ -0,0 +1,205 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/solver/bicgstab_kernels.hpp" + + +#include + + +#include +#include + + +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The BICGSTAB solver namespace. 
+ * + * @ingroup bicgstab + */ +namespace bicgstab { + + +constexpr int default_block_size = 512; + + +#include "common/solver/bicgstab_kernels.hpp.inc" + + +template +void initialize(std::shared_ptr exec, + const matrix::Dense *b, matrix::Dense *r, + matrix::Dense *rr, matrix::Dense *y, + matrix::Dense *s, matrix::Dense *t, + matrix::Dense *z, matrix::Dense *v, + matrix::Dense *p, matrix::Dense *prev_rho, + matrix::Dense *rho, matrix::Dense *alpha, + matrix::Dense *beta, matrix::Dense *gamma, + matrix::Dense *omega, + Array *stop_status) +{ + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(b->get_size()[0] * b->get_stride(), block_size.x), 1, 1); + + hipLaunchKernelGGL( + initialize_kernel, dim3(grid_size), dim3(block_size), 0, 0, + b->get_size()[0], b->get_size()[1], b->get_stride(), + as_hip_type(b->get_const_values()), as_hip_type(r->get_values()), + as_hip_type(rr->get_values()), as_hip_type(y->get_values()), + as_hip_type(s->get_values()), as_hip_type(t->get_values()), + as_hip_type(z->get_values()), as_hip_type(v->get_values()), + as_hip_type(p->get_values()), as_hip_type(prev_rho->get_values()), + as_hip_type(rho->get_values()), as_hip_type(alpha->get_values()), + as_hip_type(beta->get_values()), as_hip_type(gamma->get_values()), + as_hip_type(omega->get_values()), as_hip_type(stop_status->get_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL); + + +template +void step_1(std::shared_ptr exec, + const matrix::Dense *r, matrix::Dense *p, + const matrix::Dense *v, + const matrix::Dense *rho, + const matrix::Dense *prev_rho, + const matrix::Dense *alpha, + const matrix::Dense *omega, + const Array *stop_status) +{ + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(r->get_size()[0] * r->get_stride(), block_size.x), 1, 1); + + hipLaunchKernelGGL(step_1_kernel, dim3(grid_size), dim3(block_size), 0, 0, + r->get_size()[0], r->get_size()[1], r->get_stride(), 
+ as_hip_type(r->get_const_values()), + as_hip_type(p->get_values()), + as_hip_type(v->get_const_values()), + as_hip_type(rho->get_const_values()), + as_hip_type(prev_rho->get_const_values()), + as_hip_type(alpha->get_const_values()), + as_hip_type(omega->get_const_values()), + as_hip_type(stop_status->get_const_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_1_KERNEL); + + +template +void step_2(std::shared_ptr exec, + const matrix::Dense *r, matrix::Dense *s, + const matrix::Dense *v, + const matrix::Dense *rho, + matrix::Dense *alpha, + const matrix::Dense *beta, + const Array *stop_status) +{ + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(r->get_size()[0] * r->get_stride(), block_size.x), 1, 1); + + hipLaunchKernelGGL( + step_2_kernel, dim3(grid_size), dim3(block_size), 0, 0, + r->get_size()[0], r->get_size()[1], r->get_stride(), + as_hip_type(r->get_const_values()), as_hip_type(s->get_values()), + as_hip_type(v->get_const_values()), + as_hip_type(rho->get_const_values()), as_hip_type(alpha->get_values()), + as_hip_type(beta->get_const_values()), + as_hip_type(stop_status->get_const_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_2_KERNEL); + + +template +void step_3( + std::shared_ptr exec, matrix::Dense *x, + matrix::Dense *r, const matrix::Dense *s, + const matrix::Dense *t, const matrix::Dense *y, + const matrix::Dense *z, const matrix::Dense *alpha, + const matrix::Dense *beta, const matrix::Dense *gamma, + matrix::Dense *omega, const Array *stop_status) +{ + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(r->get_size()[0] * r->get_stride(), block_size.x), 1, 1); + + hipLaunchKernelGGL( + step_3_kernel, dim3(grid_size), dim3(block_size), 0, 0, + r->get_size()[0], r->get_size()[1], r->get_stride(), x->get_stride(), + as_hip_type(x->get_values()), as_hip_type(r->get_values()), + as_hip_type(s->get_const_values()), 
as_hip_type(t->get_const_values()), + as_hip_type(y->get_const_values()), as_hip_type(z->get_const_values()), + as_hip_type(alpha->get_const_values()), + as_hip_type(beta->get_const_values()), + as_hip_type(gamma->get_const_values()), + as_hip_type(omega->get_values()), + as_hip_type(stop_status->get_const_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_3_KERNEL); + + +template +void finalize(std::shared_ptr exec, + matrix::Dense *x, const matrix::Dense *y, + const matrix::Dense *alpha, + Array *stop_status) +{ + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(y->get_size()[0] * y->get_stride(), block_size.x), 1, 1); + + hipLaunchKernelGGL(finalize_kernel, dim3(grid_size), dim3(block_size), 0, 0, + y->get_size()[0], y->get_size()[1], y->get_stride(), + x->get_stride(), as_hip_type(x->get_values()), + as_hip_type(y->get_const_values()), + as_hip_type(alpha->get_const_values()), + as_hip_type(stop_status->get_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL); + + +} // namespace bicgstab +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/solver/cg_kernels.hip.cpp b/hip/solver/cg_kernels.hip.cpp new file mode 100644 index 00000000000..688a6ab7f49 --- /dev/null +++ b/hip/solver/cg_kernels.hip.cpp @@ -0,0 +1,141 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/solver/cg_kernels.hpp" + + +#include + + +#include +#include + + +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The CG solver namespace. 
+ * + * @ingroup cg + */ +namespace cg { + + +constexpr int default_block_size = 512; + + +#include "common/solver/cg_kernels.hpp.inc" + + +template +void initialize(std::shared_ptr exec, + const matrix::Dense *b, matrix::Dense *r, + matrix::Dense *z, matrix::Dense *p, + matrix::Dense *q, matrix::Dense *prev_rho, + matrix::Dense *rho, + Array *stop_status) +{ + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(b->get_size()[0] * b->get_stride(), block_size.x), 1, 1); + + hipLaunchKernelGGL( + initialize_kernel, dim3(grid_size), dim3(block_size), 0, 0, + b->get_size()[0], b->get_size()[1], b->get_stride(), + as_hip_type(b->get_const_values()), as_hip_type(r->get_values()), + as_hip_type(z->get_values()), as_hip_type(p->get_values()), + as_hip_type(q->get_values()), as_hip_type(prev_rho->get_values()), + as_hip_type(rho->get_values()), as_hip_type(stop_status->get_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_INITIALIZE_KERNEL); + + +template +void step_1(std::shared_ptr exec, + matrix::Dense *p, const matrix::Dense *z, + const matrix::Dense *rho, + const matrix::Dense *prev_rho, + const Array *stop_status) +{ + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(p->get_size()[0] * p->get_stride(), block_size.x), 1, 1); + + hipLaunchKernelGGL(step_1_kernel, dim3(grid_size), dim3(block_size), 0, 0, + p->get_size()[0], p->get_size()[1], p->get_stride(), + as_hip_type(p->get_values()), + as_hip_type(z->get_const_values()), + as_hip_type(rho->get_const_values()), + as_hip_type(prev_rho->get_const_values()), + as_hip_type(stop_status->get_const_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_STEP_1_KERNEL); + + +template +void step_2(std::shared_ptr exec, + matrix::Dense *x, matrix::Dense *r, + const matrix::Dense *p, + const matrix::Dense *q, + const matrix::Dense *beta, + const matrix::Dense *rho, + const Array *stop_status) +{ + const dim3 
block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(p->get_size()[0] * p->get_stride(), block_size.x), 1, 1); + + hipLaunchKernelGGL( + step_2_kernel, dim3(grid_size), dim3(block_size), 0, 0, + p->get_size()[0], p->get_size()[1], p->get_stride(), x->get_stride(), + as_hip_type(x->get_values()), as_hip_type(r->get_values()), + as_hip_type(p->get_const_values()), as_hip_type(q->get_const_values()), + as_hip_type(beta->get_const_values()), + as_hip_type(rho->get_const_values()), + as_hip_type(stop_status->get_const_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_STEP_2_KERNEL); + + +} // namespace cg +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/solver/cgs_kernels.hip.cpp b/hip/solver/cgs_kernels.hip.cpp new file mode 100644 index 00000000000..b5597777790 --- /dev/null +++ b/hip/solver/cgs_kernels.hip.cpp @@ -0,0 +1,176 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/solver/cgs_kernels.hpp" + + +#include + + +#include +#include + + +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The CGS solver namespace. + * + * @ingroup cgs + */ +namespace cgs { + + +constexpr int default_block_size = 512; + + +#include "common/solver/cgs_kernels.hpp.inc" + + +template +void initialize(std::shared_ptr exec, + const matrix::Dense *b, matrix::Dense *r, + matrix::Dense *r_tld, matrix::Dense *p, + matrix::Dense *q, matrix::Dense *u, + matrix::Dense *u_hat, + matrix::Dense *v_hat, matrix::Dense *t, + matrix::Dense *alpha, matrix::Dense *beta, + matrix::Dense *gamma, + matrix::Dense *rho_prev, + matrix::Dense *rho, + Array *stop_status) +{ + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(b->get_size()[0] * b->get_stride(), block_size.x), 1, 1); + + hipLaunchKernelGGL( + initialize_kernel, dim3(grid_size), dim3(block_size), 0, 0, + b->get_size()[0], b->get_size()[1], b->get_stride(), + as_hip_type(b->get_const_values()), as_hip_type(r->get_values()), + as_hip_type(r_tld->get_values()), as_hip_type(p->get_values()), + as_hip_type(q->get_values()), as_hip_type(u->get_values()), + as_hip_type(u_hat->get_values()), as_hip_type(v_hat->get_values()), + as_hip_type(t->get_values()), 
as_hip_type(alpha->get_values()), + as_hip_type(beta->get_values()), as_hip_type(gamma->get_values()), + as_hip_type(rho_prev->get_values()), as_hip_type(rho->get_values()), + as_hip_type(stop_status->get_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_INITIALIZE_KERNEL); + + +template +void step_1(std::shared_ptr exec, + const matrix::Dense *r, matrix::Dense *u, + matrix::Dense *p, const matrix::Dense *q, + matrix::Dense *beta, const matrix::Dense *rho, + const matrix::Dense *rho_prev, + const Array *stop_status) +{ + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(p->get_size()[0] * p->get_stride(), block_size.x), 1, 1); + + hipLaunchKernelGGL( + step_1_kernel, dim3(grid_size), dim3(block_size), 0, 0, + p->get_size()[0], p->get_size()[1], p->get_stride(), + as_hip_type(r->get_const_values()), as_hip_type(u->get_values()), + as_hip_type(p->get_values()), as_hip_type(q->get_const_values()), + as_hip_type(beta->get_values()), as_hip_type(rho->get_const_values()), + as_hip_type(rho_prev->get_const_values()), + as_hip_type(stop_status->get_const_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_1_KERNEL); + + +template +void step_2(std::shared_ptr exec, + const matrix::Dense *u, + const matrix::Dense *v_hat, matrix::Dense *q, + matrix::Dense *t, matrix::Dense *alpha, + const matrix::Dense *rho, + const matrix::Dense *gamma, + const Array *stop_status) +{ + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(u->get_size()[0] * u->get_stride(), block_size.x), 1, 1); + + hipLaunchKernelGGL( + step_2_kernel, dim3(grid_size), dim3(block_size), 0, 0, + u->get_size()[0], u->get_size()[1], u->get_stride(), + as_hip_type(u->get_const_values()), + as_hip_type(v_hat->get_const_values()), as_hip_type(q->get_values()), + as_hip_type(t->get_values()), as_hip_type(alpha->get_values()), + as_hip_type(rho->get_const_values()), + as_hip_type(gamma->get_const_values()), + 
as_hip_type(stop_status->get_const_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_2_KERNEL); + + +template +void step_3(std::shared_ptr exec, + const matrix::Dense *t, + const matrix::Dense *u_hat, matrix::Dense *r, + matrix::Dense *x, const matrix::Dense *alpha, + const Array *stop_status) +{ + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(t->get_size()[0] * t->get_stride(), block_size.x), 1, 1); + + hipLaunchKernelGGL( + step_3_kernel, dim3(grid_size), dim3(block_size), 0, 0, + t->get_size()[0], t->get_size()[1], t->get_stride(), x->get_stride(), + as_hip_type(t->get_const_values()), + as_hip_type(u_hat->get_const_values()), as_hip_type(r->get_values()), + as_hip_type(x->get_values()), as_hip_type(alpha->get_const_values()), + as_hip_type(stop_status->get_const_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_STEP_3_KERNEL); + + +} // namespace cgs +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/solver/common_trs_kernels.hip.hpp b/hip/solver/common_trs_kernels.hip.hpp new file mode 100644 index 00000000000..3bf0e56c7fa --- /dev/null +++ b/hip/solver/common_trs_kernels.hip.hpp @@ -0,0 +1,251 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_HIP_SOLVER_COMMON_TRS_KERNELS_HIP_HPP_ +#define GKO_HIP_SOLVER_COMMON_TRS_KERNELS_HIP_HPP_ + + +#include +#include + + +#include +#include + + +#include +#include + + +#include "core/matrix/dense_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/base/device_guard.hip.hpp" +#include "hip/base/hipsparse_bindings.hip.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/base/pointer_mode_guard.hip.hpp" +#include "hip/base/types.hip.hpp" + + +namespace gko { +namespace solver { + + +struct SolveStruct { + virtual void dummy(){}; +}; + + +namespace hip { + + +struct SolveStruct : gko::solver::SolveStruct { + csrsv2Info_t solve_info; + hipsparseSolvePolicy_t policy; + hipsparseMatDescr_t factor_descr; + int factor_work_size; + void *factor_work_vec; + SolveStruct() + { + factor_work_vec = nullptr; + GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreateMatDescr(&factor_descr)); + GKO_ASSERT_NO_HIPSPARSE_ERRORS( + 
hipsparseSetMatIndexBase(factor_descr, HIPSPARSE_INDEX_BASE_ZERO)); + GKO_ASSERT_NO_HIPSPARSE_ERRORS( + hipsparseSetMatType(factor_descr, HIPSPARSE_MATRIX_TYPE_GENERAL)); + GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseSetMatDiagType( + factor_descr, HIPSPARSE_DIAG_TYPE_NON_UNIT)); + GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreateCsrsv2Info(&solve_info)); + policy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL; + } + + SolveStruct(const SolveStruct &) = delete; + + SolveStruct(SolveStruct &&) = delete; + + SolveStruct &operator=(const SolveStruct &) = delete; + + SolveStruct &operator=(SolveStruct &&) = delete; + + ~SolveStruct() + { + hipsparseDestroyMatDescr(factor_descr); + if (solve_info) { + hipsparseDestroyCsrsv2Info(solve_info); + } + if (factor_work_vec != nullptr) { + hipFree(factor_work_vec); + factor_work_vec = nullptr; + } + } +}; + + +} // namespace hip +} // namespace solver + + +namespace kernels { +namespace hip { +namespace { + + +void should_perform_transpose_kernel(std::shared_ptr exec, + bool &do_transpose) +{ + do_transpose = true; +} + + +void init_struct_kernel(std::shared_ptr exec, + std::shared_ptr &solve_struct) +{ + solve_struct = std::make_shared(); +} + + +template +void generate_kernel(std::shared_ptr exec, + const matrix::Csr *matrix, + solver::SolveStruct *solve_struct, + const gko::size_type num_rhs, bool is_upper) +{ + if (hipsparse::is_supported::value) { + if (auto hip_solve_struct = + dynamic_cast(solve_struct)) { + auto handle = exec->get_hipsparse_handle(); + if (is_upper) { + GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseSetMatFillMode( + hip_solve_struct->factor_descr, HIPSPARSE_FILL_MODE_UPPER)); + } + + { + hipsparse::pointer_mode_guard pm_guard(handle); + hipsparse::csrsv2_buffer_size( + handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, + matrix->get_size()[0], matrix->get_num_stored_elements(), + hip_solve_struct->factor_descr, matrix->get_const_values(), + matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), + hip_solve_struct->solve_info, + 
&hip_solve_struct->factor_work_size); + + // allocate workspace + if (hip_solve_struct->factor_work_vec != nullptr) { + exec->free(hip_solve_struct->factor_work_vec); + } + hip_solve_struct->factor_work_vec = + exec->alloc(hip_solve_struct->factor_work_size); + + hipsparse::csrsv2_analysis( + handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, + matrix->get_size()[0], matrix->get_num_stored_elements(), + hip_solve_struct->factor_descr, matrix->get_const_values(), + matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), + hip_solve_struct->solve_info, hip_solve_struct->policy, + hip_solve_struct->factor_work_vec); + } + } else { + GKO_NOT_SUPPORTED(solve_struct); + } + } else { + GKO_NOT_IMPLEMENTED; + } +} + + +template +void solve_kernel(std::shared_ptr exec, + const matrix::Csr *matrix, + const solver::SolveStruct *solve_struct, + matrix::Dense *trans_b, + matrix::Dense *trans_x, + const matrix::Dense *b, + matrix::Dense *x) +{ + using vec = matrix::Dense; + + if (hipsparse::is_supported::value) { + if (auto hip_solve_struct = + dynamic_cast(solve_struct)) { + ValueType one = 1.0; + auto handle = exec->get_hipsparse_handle(); + + { + hipsparse::pointer_mode_guard pm_guard(handle); + if (b->get_stride() == 1) { + hipsparse::csrsv2_solve( + handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, + matrix->get_size()[0], + matrix->get_num_stored_elements(), &one, + hip_solve_struct->factor_descr, + matrix->get_const_values(), + matrix->get_const_row_ptrs(), + matrix->get_const_col_idxs(), + hip_solve_struct->solve_info, b->get_const_values(), + x->get_values(), hip_solve_struct->policy, + hip_solve_struct->factor_work_vec); + } else { + dense::transpose(exec, b, trans_b); + dense::transpose(exec, x, trans_x); + for (IndexType i = 0; i < trans_b->get_size()[0]; i++) { + hipsparse::csrsv2_solve( + handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, + matrix->get_size()[0], + matrix->get_num_stored_elements(), &one, + hip_solve_struct->factor_descr, + matrix->get_const_values(), + 
matrix->get_const_row_ptrs(), + matrix->get_const_col_idxs(), + hip_solve_struct->solve_info, + trans_b->get_values() + i * trans_b->get_stride(), + trans_x->get_values() + i * trans_x->get_stride(), + hip_solve_struct->policy, + hip_solve_struct->factor_work_vec); + } + dense::transpose(exec, trans_x, x); + } + } + } else { + GKO_NOT_SUPPORTED(solve_struct); + } + } else { + GKO_NOT_IMPLEMENTED; + } +} + + +} // namespace +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_SOLVER_COMMON_TRS_KERNELS_HIP_HPP_ diff --git a/hip/solver/fcg_kernels.hip.cpp b/hip/solver/fcg_kernels.hip.cpp new file mode 100644 index 00000000000..750aa5743d7 --- /dev/null +++ b/hip/solver/fcg_kernels.hip.cpp @@ -0,0 +1,144 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/solver/fcg_kernels.hpp" + + +#include + + +#include +#include + + +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The FCG solver namespace. + * + * @ingroup fcg + */ +namespace fcg { + + +constexpr int default_block_size = 512; + + +#include "common/solver/fcg_kernels.hpp.inc" + + +template +void initialize(std::shared_ptr exec, + const matrix::Dense *b, matrix::Dense *r, + matrix::Dense *z, matrix::Dense *p, + matrix::Dense *q, matrix::Dense *t, + matrix::Dense *prev_rho, + matrix::Dense *rho, matrix::Dense *rho_t, + Array *stop_status) +{ + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(b->get_size()[0] * b->get_stride(), block_size.x), 1, 1); + + hipLaunchKernelGGL( + initialize_kernel, dim3(grid_size), dim3(block_size), 0, 0, + b->get_size()[0], b->get_size()[1], b->get_stride(), + as_hip_type(b->get_const_values()), as_hip_type(r->get_values()), + as_hip_type(z->get_values()), as_hip_type(p->get_values()), + as_hip_type(q->get_values()), as_hip_type(t->get_values()), + as_hip_type(prev_rho->get_values()), as_hip_type(rho->get_values()), + as_hip_type(rho_t->get_values()), as_hip_type(stop_status->get_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_INITIALIZE_KERNEL); + + 
+template +void step_1(std::shared_ptr exec, + matrix::Dense *p, const matrix::Dense *z, + const matrix::Dense *rho_t, + const matrix::Dense *prev_rho, + const Array *stop_status) +{ + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(p->get_size()[0] * p->get_stride(), block_size.x), 1, 1); + + hipLaunchKernelGGL(step_1_kernel, dim3(grid_size), dim3(block_size), 0, 0, + p->get_size()[0], p->get_size()[1], p->get_stride(), + as_hip_type(p->get_values()), + as_hip_type(z->get_const_values()), + as_hip_type(rho_t->get_const_values()), + as_hip_type(prev_rho->get_const_values()), + as_hip_type(stop_status->get_const_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_STEP_1_KERNEL); + + +template +void step_2(std::shared_ptr exec, + matrix::Dense *x, matrix::Dense *r, + matrix::Dense *t, const matrix::Dense *p, + const matrix::Dense *q, + const matrix::Dense *beta, + const matrix::Dense *rho, + const Array *stop_status) +{ + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(p->get_size()[0] * p->get_stride(), block_size.x), 1, 1); + + hipLaunchKernelGGL( + step_2_kernel, dim3(grid_size), dim3(block_size), 0, 0, + p->get_size()[0], p->get_size()[1], p->get_stride(), x->get_stride(), + as_hip_type(x->get_values()), as_hip_type(r->get_values()), + as_hip_type(t->get_values()), as_hip_type(p->get_const_values()), + as_hip_type(q->get_const_values()), + as_hip_type(beta->get_const_values()), + as_hip_type(rho->get_const_values()), + as_hip_type(stop_status->get_const_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_STEP_2_KERNEL); + + +} // namespace fcg +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/solver/gmres_kernels.hip.cpp b/hip/solver/gmres_kernels.hip.cpp new file mode 100644 index 00000000000..2780b149660 --- /dev/null +++ b/hip/solver/gmres_kernels.hip.cpp @@ -0,0 +1,350 @@ 
+/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/solver/gmres_kernels.hpp" + + +#include + + +#include + + +#include +#include +#include +#include + + +#include "core/components/fill_array.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/hipblas_bindings.hip.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/atomic.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/uninitialized_array.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The GMRES solver namespace. + * + * @ingroup gmres + */ +namespace gmres { + + +constexpr int default_block_size = 512; +// default_dot_dim can not be 64 in hip because 64 * 64 exceeds their max block +// size limit. +constexpr int default_dot_dim = 32; +constexpr int default_dot_size = default_dot_dim * default_dot_dim; + + +#include "common/solver/gmres_kernels.hpp.inc" + + +template +void initialize_1(std::shared_ptr exec, + const matrix::Dense *b, + matrix::Dense *residual, + matrix::Dense *givens_sin, + matrix::Dense *givens_cos, + Array *stop_status, size_type krylov_dim) +{ + const auto num_threads = std::max(b->get_size()[0] * b->get_stride(), + krylov_dim * b->get_size()[1]); + const dim3 grid_dim(ceildiv(num_threads, default_block_size), 1, 1); + const dim3 block_dim(default_block_size, 1, 1); + constexpr auto block_size = default_block_size; + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(initialize_1_kernel), dim3(grid_dim), + dim3(block_dim), 0, 0, b->get_size()[0], b->get_size()[1], krylov_dim, + as_hip_type(b->get_const_values()), b->get_stride(), + as_hip_type(residual->get_values()), residual->get_stride(), + as_hip_type(givens_sin->get_values()), givens_sin->get_stride(), + as_hip_type(givens_cos->get_values()), givens_cos->get_stride(), + 
as_hip_type(stop_status->get_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL); + + +template +void initialize_2(std::shared_ptr exec, + const matrix::Dense *residual, + matrix::Dense> *residual_norm, + matrix::Dense *residual_norm_collection, + matrix::Dense *krylov_bases, + Array *final_iter_nums, size_type krylov_dim) +{ + const auto num_rows = residual->get_size()[0]; + const auto num_rhs = residual->get_size()[1]; + const dim3 grid_dim_1( + ceildiv(krylov_bases->get_size()[0] * krylov_bases->get_stride(), + default_block_size), + 1, 1); + const dim3 block_dim(default_block_size, 1, 1); + constexpr auto block_size = default_block_size; + + residual->compute_norm2(residual_norm); + + const dim3 grid_dim_2(ceildiv(num_rows * num_rhs, default_block_size), 1, + 1); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(initialize_2_2_kernel), dim3(grid_dim_2), + dim3(block_dim), 0, 0, residual->get_size()[0], residual->get_size()[1], + as_hip_type(residual->get_const_values()), residual->get_stride(), + as_hip_type(residual_norm->get_const_values()), + as_hip_type(residual_norm_collection->get_values()), + as_hip_type(krylov_bases->get_values()), krylov_bases->get_stride(), + as_hip_type(final_iter_nums->get_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL); + + +template +void finish_arnoldi(std::shared_ptr exec, size_type num_rows, + matrix::Dense *krylov_bases, + matrix::Dense *hessenberg_iter, size_type iter, + const stopping_status *stop_status) +{ + const auto stride_krylov = krylov_bases->get_stride(); + const auto stride_hessenberg = hessenberg_iter->get_stride(); + auto hipblas_handle = exec->get_hipblas_handle(); + const dim3 grid_size( + ceildiv(hessenberg_iter->get_size()[1], default_dot_dim), + exec->get_num_multiprocessor() * 2); + const dim3 block_size(default_dot_dim, default_dot_dim); + auto next_krylov_basis = + krylov_bases->get_values() + + (iter + 1) * num_rows * 
hessenberg_iter->get_size()[1]; + for (size_type k = 0; k < iter + 1; ++k) { + const auto k_krylov_bases = + krylov_bases->get_const_values() + + k * num_rows * hessenberg_iter->get_size()[1]; + if (hessenberg_iter->get_size()[1] > 1) { + // TODO: this condition should be tuned + // single rhs will use vendor's dot, otherwise, use our own + // multidot_kernel which parallelize multiple rhs. + components::fill_array( + exec, hessenberg_iter->get_values() + k * stride_hessenberg, + hessenberg_iter->get_size()[1], zero()); + hipLaunchKernelGGL( + multidot_kernel, dim3(grid_size), dim3(block_size), 0, 0, k, + num_rows, hessenberg_iter->get_size()[1], + as_hip_type(k_krylov_bases), as_hip_type(next_krylov_basis), + stride_krylov, as_hip_type(hessenberg_iter->get_values()), + stride_hessenberg, as_hip_type(stop_status)); + } else { + hipblas::dot(exec->get_hipblas_handle(), num_rows, k_krylov_bases, + stride_krylov, next_krylov_basis, stride_krylov, + hessenberg_iter->get_values() + k * stride_hessenberg); + } + hipLaunchKernelGGL( + HIP_KERNEL_NAME(update_next_krylov_kernel), + dim3(ceildiv(num_rows * stride_krylov, default_block_size)), + dim3(default_block_size), 0, 0, k, num_rows, + hessenberg_iter->get_size()[1], as_hip_type(k_krylov_bases), + as_hip_type(next_krylov_basis), stride_krylov, + as_hip_type(hessenberg_iter->get_const_values()), stride_hessenberg, + as_hip_type(stop_status)); + } + // for i in 1:iter + // hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i) + // next_krylov_basis -= hessenberg(iter, i) * krylov_bases(:, i) + // end + + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(update_hessenberg_2_kernel), + dim3(hessenberg_iter->get_size()[1]), dim3(default_block_size), 0, 0, + iter, num_rows, hessenberg_iter->get_size()[1], + as_hip_type(next_krylov_basis), stride_krylov, + as_hip_type(hessenberg_iter->get_values()), stride_hessenberg, + as_hip_type(stop_status)); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(update_krylov_kernel), + 
dim3(ceildiv(num_rows * stride_krylov, default_block_size)), + dim3(default_block_size), 0, 0, iter, num_rows, + hessenberg_iter->get_size()[1], as_hip_type(next_krylov_basis), + stride_krylov, as_hip_type(hessenberg_iter->get_const_values()), + stride_hessenberg, as_hip_type(stop_status)); + // next_krylov_basis /= hessenberg(iter, iter + 1) + // End of arnoldi +} + + +template +void givens_rotation(std::shared_ptr exec, + matrix::Dense *givens_sin, + matrix::Dense *givens_cos, + matrix::Dense *hessenberg_iter, + matrix::Dense> *residual_norm, + matrix::Dense *residual_norm_collection, + size_type iter, const Array *stop_status) +{ + // TODO: tune block_size for optimal performance + constexpr auto block_size = default_block_size; + const auto num_cols = hessenberg_iter->get_size()[1]; + const dim3 block_dim{block_size, 1, 1}; + const dim3 grid_dim{ + static_cast(ceildiv(num_cols, block_size)), 1, 1}; + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(givens_rotation_kernel), dim3(grid_dim), + dim3(block_dim), 0, 0, hessenberg_iter->get_size()[0], + hessenberg_iter->get_size()[1], iter, + as_hip_type(hessenberg_iter->get_values()), + hessenberg_iter->get_stride(), as_hip_type(givens_sin->get_values()), + givens_sin->get_stride(), as_hip_type(givens_cos->get_values()), + givens_cos->get_stride(), as_hip_type(residual_norm->get_values()), + as_hip_type(residual_norm_collection->get_values()), + residual_norm_collection->get_stride(), + as_hip_type(stop_status->get_const_data())); +} + + +template +void step_1(std::shared_ptr exec, size_type num_rows, + matrix::Dense *givens_sin, + matrix::Dense *givens_cos, + matrix::Dense> *residual_norm, + matrix::Dense *residual_norm_collection, + matrix::Dense *krylov_bases, + matrix::Dense *hessenberg_iter, size_type iter, + Array *final_iter_nums, + const Array *stop_status) +{ + hipLaunchKernelGGL( + increase_final_iteration_numbers_kernel, + dim3(static_cast( + ceildiv(final_iter_nums->get_num_elems(), default_block_size))), + 
dim3(default_block_size), 0, 0, + as_hip_type(final_iter_nums->get_data()), + as_hip_type(stop_status->get_const_data()), + final_iter_nums->get_num_elems()); + finish_arnoldi(exec, num_rows, krylov_bases, hessenberg_iter, iter, + stop_status->get_const_data()); + givens_rotation(exec, givens_sin, givens_cos, hessenberg_iter, + residual_norm, residual_norm_collection, iter, stop_status); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_STEP_1_KERNEL); + + +template +void solve_upper_triangular( + const matrix::Dense *residual_norm_collection, + const matrix::Dense *hessenberg, matrix::Dense *y, + const Array *final_iter_nums) +{ + // TODO: tune block_size for optimal performance + constexpr auto block_size = default_block_size; + const auto num_rhs = residual_norm_collection->get_size()[1]; + const dim3 block_dim{block_size, 1, 1}; + const dim3 grid_dim{static_cast(ceildiv(num_rhs, block_size)), + 1, 1}; + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(solve_upper_triangular_kernel), + dim3(grid_dim), dim3(block_dim), 0, 0, hessenberg->get_size()[1], + num_rhs, as_hip_type(residual_norm_collection->get_const_values()), + residual_norm_collection->get_stride(), + as_hip_type(hessenberg->get_const_values()), hessenberg->get_stride(), + as_hip_type(y->get_values()), y->get_stride(), + as_hip_type(final_iter_nums->get_const_data())); +} + + +template +void calculate_qy(const matrix::Dense *krylov_bases, + const matrix::Dense *y, + matrix::Dense *before_preconditioner, + const Array *final_iter_nums) +{ + const auto num_rows = before_preconditioner->get_size()[0]; + const auto num_cols = krylov_bases->get_size()[1]; + const auto num_rhs = before_preconditioner->get_size()[1]; + const auto stride_before_preconditioner = + before_preconditioner->get_stride(); + + constexpr auto block_size = default_block_size; + const dim3 grid_dim{ + static_cast( + ceildiv(num_rows * stride_before_preconditioner, block_size)), + 1, 1}; + const dim3 block_dim{block_size, 1, 1}; + + + 
hipLaunchKernelGGL( + HIP_KERNEL_NAME(calculate_Qy_kernel), dim3(grid_dim), + dim3(block_dim), 0, 0, num_rows, num_cols, num_rhs, + as_hip_type(krylov_bases->get_const_values()), + krylov_bases->get_stride(), as_hip_type(y->get_const_values()), + y->get_stride(), as_hip_type(before_preconditioner->get_values()), + stride_before_preconditioner, + as_hip_type(final_iter_nums->get_const_data())); + // Calculate qy + // before_preconditioner = krylov_bases * y +} + + +template +void step_2(std::shared_ptr exec, + const matrix::Dense *residual_norm_collection, + const matrix::Dense *krylov_bases, + const matrix::Dense *hessenberg, + matrix::Dense *y, + matrix::Dense *before_preconditioner, + const Array *final_iter_nums) +{ + solve_upper_triangular(residual_norm_collection, hessenberg, y, + final_iter_nums); + calculate_qy(krylov_bases, y, before_preconditioner, final_iter_nums); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_STEP_2_KERNEL); + + +} // namespace gmres +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/solver/ir_kernels.hip.cpp b/hip/solver/ir_kernels.hip.cpp new file mode 100644 index 00000000000..5993c4b120b --- /dev/null +++ b/hip/solver/ir_kernels.hip.cpp @@ -0,0 +1,78 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/solver/ir_kernels.hpp" + + +#include + + +#include + + +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The IR solver namespace. 
+ * + * @ingroup ir + */ +namespace ir { + + +constexpr int default_block_size = 512; + + +#include "common/solver/ir_kernels.hpp.inc" + + +void initialize(std::shared_ptr exec, + Array *stop_status) +{ + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size(ceildiv(stop_status->get_num_elems(), block_size.x), 1, + 1); + + hipLaunchKernelGGL(initialize_kernel, dim3(grid_size), dim3(block_size), 0, + 0, stop_status->get_num_elems(), + stop_status->get_data()); +} + + +} // namespace ir +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/solver/lower_trs_kernels.hip.cpp b/hip/solver/lower_trs_kernels.hip.cpp new file mode 100644 index 00000000000..d4e66513ebe --- /dev/null +++ b/hip/solver/lower_trs_kernels.hip.cpp @@ -0,0 +1,110 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/solver/lower_trs_kernels.hpp" + + +#include + + +#include +#include + + +#include +#include +#include + + +#include "hip/base/hipsparse_bindings.hip.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/solver/common_trs_kernels.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The LOWER_TRS solver namespace. + * + * @ingroup lower_trs + */ +namespace lower_trs { + + +void should_perform_transpose(std::shared_ptr exec, + bool &do_transpose) +{ + should_perform_transpose_kernel(exec, do_transpose); +} + + +void init_struct(std::shared_ptr exec, + std::shared_ptr &solve_struct) +{ + init_struct_kernel(exec, solve_struct); +} + + +template +void generate(std::shared_ptr exec, + const matrix::Csr *matrix, + solver::SolveStruct *solve_struct, const gko::size_type num_rhs) +{ + generate_kernel(exec, matrix, solve_struct, num_rhs, + false); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL); + + +template +void solve(std::shared_ptr exec, + const matrix::Csr *matrix, + const solver::SolveStruct *solve_struct, + matrix::Dense *trans_b, matrix::Dense *trans_x, + const matrix::Dense *b, matrix::Dense *x) +{ + solve_kernel(exec, matrix, solve_struct, trans_b, + trans_x, b, x); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL); + + +} 
// namespace lower_trs +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/solver/upper_trs_kernels.hip.cpp b/hip/solver/upper_trs_kernels.hip.cpp new file mode 100644 index 00000000000..0f27b6ceb28 --- /dev/null +++ b/hip/solver/upper_trs_kernels.hip.cpp @@ -0,0 +1,110 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/solver/upper_trs_kernels.hpp" + + +#include + + +#include +#include + + +#include +#include +#include + + +#include "hip/base/hipsparse_bindings.hip.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/solver/common_trs_kernels.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The UPPER_TRS solver namespace. + * + * @ingroup upper_trs + */ +namespace upper_trs { + + +void should_perform_transpose(std::shared_ptr exec, + bool &do_transpose) +{ + should_perform_transpose_kernel(exec, do_transpose); +} + + +void init_struct(std::shared_ptr exec, + std::shared_ptr &solve_struct) +{ + init_struct_kernel(exec, solve_struct); +} + + +template +void generate(std::shared_ptr exec, + const matrix::Csr *matrix, + solver::SolveStruct *solve_struct, const gko::size_type num_rhs) +{ + generate_kernel(exec, matrix, solve_struct, num_rhs, + true); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL); + + +template +void solve(std::shared_ptr exec, + const matrix::Csr *matrix, + const solver::SolveStruct *solve_struct, + matrix::Dense *trans_b, matrix::Dense *trans_x, + const matrix::Dense *b, matrix::Dense *x) +{ + solve_kernel(exec, matrix, solve_struct, trans_b, + trans_x, b, x); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL); + + +} // namespace upper_trs +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/stop/criterion_kernels.hip.cpp b/hip/stop/criterion_kernels.hip.cpp new file mode 100644 index 00000000000..0c2cf4da378 --- /dev/null +++ b/hip/stop/criterion_kernels.hip.cpp @@ -0,0 +1,87 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/stop/criterion_kernels.hpp" + + +#include +#include +#include + + +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The Set all statuses namespace. 
+ * @ref set_status + * @ingroup set_all_statuses + */ +namespace set_all_statuses { + + +constexpr int default_block_size = 512; + + +__global__ __launch_bounds__(default_block_size) void set_all_statuses( + size_type num_elems, uint8 stoppingId, bool setFinalized, + stopping_status *stop_status) +{ + const auto tidx = thread::get_thread_id_flat(); + if (tidx < num_elems) { + stop_status[tidx].stop(stoppingId, setFinalized); + } +} + + +void set_all_statuses(std::shared_ptr exec, uint8 stoppingId, + bool setFinalized, Array *stop_status) +{ + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size(ceildiv(stop_status->get_num_elems(), block_size.x), 1, + 1); + + hipLaunchKernelGGL((set_all_statuses), dim3(grid_size), dim3(block_size), 0, + 0, stop_status->get_num_elems(), stoppingId, + setFinalized, as_hip_type(stop_status->get_data())); +} + + +} // namespace set_all_statuses +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/stop/residual_norm_kernels.hip.cpp b/hip/stop/residual_norm_kernels.hip.cpp new file mode 100644 index 00000000000..d104a29d8a8 --- /dev/null +++ b/hip/stop/residual_norm_kernels.hip.cpp @@ -0,0 +1,130 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/stop/residual_norm_kernels.hpp" + + +#include + + +#include +#include +#include + + +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The Residual norm stopping criterion namespace. 
+ * @ref resnorm + * @ingroup resnorm + */ +namespace residual_norm { + + +constexpr int default_block_size = 512; + + +template +__global__ __launch_bounds__(default_block_size) void residual_norm_kernel( + size_type num_cols, ValueType rel_residual_goal, + const ValueType *__restrict__ tau, const ValueType *__restrict__ orig_tau, + uint8 stoppingId, bool setFinalized, + stopping_status *__restrict__ stop_status, + bool *__restrict__ device_storage) +{ + const auto tidx = thread::get_thread_id_flat(); + if (tidx < num_cols) { + if (tau[tidx] < rel_residual_goal * orig_tau[tidx]) { + stop_status[tidx].converge(stoppingId, setFinalized); + device_storage[1] = true; + } + // because only false is written to all_converged, write conflicts + // should not cause any problem + else if (!stop_status[tidx].has_stopped()) { + device_storage[0] = false; + } + } +} + + +__global__ __launch_bounds__(1) void init_kernel( + bool *__restrict__ device_storage) +{ + device_storage[0] = true; + device_storage[1] = false; +} + + +template +void residual_norm(std::shared_ptr exec, + const matrix::Dense *tau, + const matrix::Dense *orig_tau, + ValueType rel_residual_goal, uint8 stoppingId, + bool setFinalized, Array *stop_status, + Array *device_storage, bool *all_converged, + bool *one_changed) +{ + static_assert(is_complex_s::value == false, + "ValueType must not be complex in this function!"); + hipLaunchKernelGGL((init_kernel), dim3(1), dim3(1), 0, 0, + as_hip_type(device_storage->get_data())); + + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size(ceildiv(tau->get_size()[1], block_size.x), 1, 1); + + hipLaunchKernelGGL((residual_norm_kernel), dim3(grid_size), + dim3(block_size), 0, 0, tau->get_size()[1], + rel_residual_goal, as_hip_type(tau->get_const_values()), + as_hip_type(orig_tau->get_const_values()), stoppingId, + setFinalized, as_hip_type(stop_status->get_data()), + as_hip_type(device_storage->get_data())); + + /* Represents all_converged, one_changed 
*/ + *all_converged = exec->copy_val_to_host(device_storage->get_const_data()); + *one_changed = exec->copy_val_to_host(device_storage->get_const_data() + 1); +} + +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE( + GKO_DECLARE_RESIDUAL_NORM_KERNEL); + + +} // namespace residual_norm +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/test/CMakeLists.txt b/hip/test/CMakeLists.txt new file mode 100644 index 00000000000..fd1fa2941d8 --- /dev/null +++ b/hip/test/CMakeLists.txt @@ -0,0 +1,10 @@ +include(${CMAKE_SOURCE_DIR}/cmake/create_test.cmake) + +add_subdirectory(base) +add_subdirectory(components) +add_subdirectory(factorization) +add_subdirectory(matrix) +add_subdirectory(solver) +add_subdirectory(preconditioner) +add_subdirectory(stop) +add_subdirectory(utils) diff --git a/hip/test/base/CMakeLists.txt b/hip/test/base/CMakeLists.txt new file mode 100644 index 00000000000..4719886d4d9 --- /dev/null +++ b/hip/test/base/CMakeLists.txt @@ -0,0 +1,8 @@ +ginkgo_create_hip_test(hip_executor) +ginkgo_create_hip_test(math) +# Only hcc needs the libraries. nvcc only requires the headers. +if (GINKGO_HIP_PLATFORM MATCHES "hcc") + ginkgo_create_hip_test(exception_helpers roc::hipblas roc::hipsparse) +else() + ginkgo_create_hip_test(exception_helpers) +endif() diff --git a/hip/test/base/exception_helpers.hip.cpp b/hip/test/base/exception_helpers.hip.cpp new file mode 100644 index 00000000000..8261cc24f0e --- /dev/null +++ b/hip/test/base/exception_helpers.hip.cpp @@ -0,0 +1,83 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#include +#include + + +#include + + +namespace { + + +TEST(AssertNoHipErrors, ThrowsOnError) +{ + ASSERT_THROW(GKO_ASSERT_NO_HIP_ERRORS(1), gko::HipError); +} + + +TEST(AssertNoHipErrors, DoesNotThrowOnSuccess) +{ + ASSERT_NO_THROW(GKO_ASSERT_NO_HIP_ERRORS(hipSuccess)); +} + + +TEST(AssertNoHipblasErrors, ThrowsOnError) +{ + ASSERT_THROW(GKO_ASSERT_NO_HIPBLAS_ERRORS(1), gko::HipblasError); +} + + +TEST(AssertNoHipblasErrors, DoesNotThrowOnSuccess) +{ + ASSERT_NO_THROW(GKO_ASSERT_NO_HIPBLAS_ERRORS(HIPBLAS_STATUS_SUCCESS)); +} + + +TEST(AssertNoHipsparseErrors, ThrowsOnError) +{ + ASSERT_THROW(GKO_ASSERT_NO_HIPSPARSE_ERRORS(1), gko::HipsparseError); +} + + +TEST(AssertNoHipsparseErrors, DoesNotThrowOnSuccess) +{ + ASSERT_NO_THROW(GKO_ASSERT_NO_HIPSPARSE_ERRORS(HIPSPARSE_STATUS_SUCCESS)); +} + + +} // namespace diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp new file mode 100644 index 00000000000..635639fc21e --- /dev/null +++ b/hip/test/base/hip_executor.hip.cpp @@ -0,0 +1,263 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +// force-top: on +// prevent compilation failure related to disappearing assert(...) statements +#include +// force-top: off + + +#include + + +#include +#include + + +#include + + +#include +#include + + +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class ExampleOperation : public gko::Operation { +public: + explicit ExampleOperation(int &val) : value(val) {} + + void run(std::shared_ptr) const override + { + value = -1; + } + + void run(std::shared_ptr) const override + { + value = -2; + } + + void run(std::shared_ptr) const override + { + value = -3; + } + + void run(std::shared_ptr) const override + { + hipGetDevice(&value); + } + + int &value; +}; + + +class HipExecutor : public ::testing::Test { +protected: + HipExecutor() : omp(gko::OmpExecutor::create()), hip(nullptr), hip2(nullptr) + {} + + void SetUp() + { + ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); + hip = gko::HipExecutor::create(0, omp); + hip2 = gko::HipExecutor::create(gko::HipExecutor::get_num_devices() - 1, + omp); + } + + void TearDown() + { + if (hip != nullptr) { + // ensure that previous calls finished and didn't throw an error + 
ASSERT_NO_THROW(hip->synchronize()); + } + } + + std::shared_ptr omp; + std::shared_ptr hip; + std::shared_ptr hip2; +}; + + +TEST_F(HipExecutor, CanInstantiateTwoExecutorsOnOneDevice) +{ + auto hip = gko::HipExecutor::create(0, omp); + auto hip2 = gko::HipExecutor::create(0, omp); + + // We want automatic deinitialization to not create any error +} + + +TEST_F(HipExecutor, MasterKnowsNumberOfDevices) +{ + int count = 0; + hipGetDeviceCount(&count); + + auto num_devices = gko::HipExecutor::get_num_devices(); + + ASSERT_EQ(count, num_devices); +} + + +TEST_F(HipExecutor, AllocatesAndFreesMemory) +{ + int *ptr = nullptr; + + ASSERT_NO_THROW(ptr = hip->alloc(2)); + ASSERT_NO_THROW(hip->free(ptr)); +} + + +TEST_F(HipExecutor, FailsWhenOverallocating) +{ + const gko::size_type num_elems = 1ll << 50; // 4PB of integers + int *ptr = nullptr; + + ASSERT_THROW( + { + ptr = hip->alloc(num_elems); + hip->synchronize(); + }, + gko::AllocationError); + + hip->free(ptr); +} + + +__global__ void check_data(int *data) +{ + if (data[0] != 3 || data[1] != 8) { +#if GINKGO_HIP_PLATFORM_HCC + asm("s_trap 0x02;"); +#else // GINKGO_HIP_PLATFORM_NVCC + asm("trap;"); +#endif + } +} + +TEST_F(HipExecutor, CopiesDataToHip) +{ + int orig[] = {3, 8}; + auto *copy = hip->alloc(2); + + hip->copy_from(omp.get(), 2, orig, copy); + + hipLaunchKernelGGL((check_data), dim3(1), dim3(1), 0, 0, copy); + ASSERT_NO_THROW(hip->synchronize()); + hip->free(copy); +} + + +__global__ void init_data(int *data) +{ + data[0] = 3; + data[1] = 8; +} + +TEST_F(HipExecutor, CopiesDataFromHip) +{ + int copy[2]; + auto orig = hip->alloc(2); + hipLaunchKernelGGL((init_data), dim3(1), dim3(1), 0, 0, orig); + + omp->copy_from(hip.get(), 2, orig, copy); + + EXPECT_EQ(3, copy[0]); + ASSERT_EQ(8, copy[1]); + hip->free(orig); +} + + +/* Properly checks if it works only when multiple GPUs exist */ +TEST_F(HipExecutor, PreservesDeviceSettings) +{ + auto previous_device = gko::HipExecutor::get_num_devices() - 1; + 
GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(previous_device)); + auto orig = hip->alloc(2); + int current_device; + GKO_ASSERT_NO_HIP_ERRORS(hipGetDevice(¤t_device)); + ASSERT_EQ(current_device, previous_device); + + hip->free(orig); + GKO_ASSERT_NO_HIP_ERRORS(hipGetDevice(¤t_device)); + ASSERT_EQ(current_device, previous_device); +} + + +TEST_F(HipExecutor, RunsOnProperDevice) +{ + int value = -1; + + GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(0)); + hip2->run(ExampleOperation(value)); + + ASSERT_EQ(value, hip2->get_device_id()); +} + + +TEST_F(HipExecutor, CopiesDataFromHipToHip) +{ + int copy[2]; + auto orig = hip->alloc(2); + GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(0)); + hipLaunchKernelGGL((init_data), dim3(1), dim3(1), 0, 0, orig); + + auto copy_hip2 = hip2->alloc(2); + hip2->copy_from(hip.get(), 2, orig, copy_hip2); + + // Check that the data is really on GPU2 and ensure we did not cheat + int value = -1; + GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(hip2->get_device_id())); + hipLaunchKernelGGL((check_data), dim3(1), dim3(1), 0, 0, copy_hip2); + GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(0)); + hip2->run(ExampleOperation(value)); + ASSERT_EQ(value, hip2->get_device_id()); + // Put the results on OpenMP and run CPU side assertions + omp->copy_from(hip2.get(), 2, copy_hip2, copy); + EXPECT_EQ(3, copy[0]); + ASSERT_EQ(8, copy[1]); + hip->free(copy_hip2); + hip->free(orig); +} + + +TEST_F(HipExecutor, Synchronizes) +{ + // Todo design a proper unit test once we support streams + ASSERT_NO_THROW(hip->synchronize()); +} + + +} // namespace diff --git a/hip/test/base/math.hip.cpp b/hip/test/base/math.hip.cpp new file mode 100644 index 00000000000..818506a8d25 --- /dev/null +++ b/hip/test/base/math.hip.cpp @@ -0,0 +1,169 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +// force-top: on +// prevent compilation failure related to disappearing assert(...) 
statements +#include +// force-top: off + + +#include + + +#include +#include +#include + + +#include + + +#include +#include + + +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { +namespace kernel { + + +template +__device__ bool test_real_is_finite_function(FuncType isfin) +{ + constexpr T inf = gko::device_numeric_limits::inf; + constexpr T quiet_nan = NAN; + bool test_true{}; + bool test_false{}; + + test_true = isfin(T{0}) && isfin(-T{0}) && isfin(T{1}); + test_false = isfin(inf) || isfin(-inf) || isfin(quiet_nan) || + isfin(inf - inf) || isfin(inf / inf) || isfin(inf * T{2}) || + isfin(T{1} / T{0}) || isfin(T{0} / T{0}); + return test_true && !test_false; +} + + +template +__device__ bool test_complex_is_finite_function(FuncType isfin) +{ + static_assert(gko::is_complex_s::value, + "Template type must be a complex type."); + using T = gko::remove_complex; + using c_type = gko::kernels::hip::hip_type; + constexpr T inf = gko::device_numeric_limits::inf; + constexpr T quiet_nan = NAN; + bool test_true{}; + bool test_false{}; + + test_true = isfin(c_type{T{0}, T{0}}) && isfin(c_type{-T{0}, -T{0}}) && + isfin(c_type{T{1}, T{0}}) && isfin(c_type{T{0}, T{1}}); + test_false = isfin(c_type{inf, T{0}}) || isfin(c_type{-inf, T{0}}) || + isfin(c_type{quiet_nan, T{0}}) || isfin(c_type{T{0}, inf}) || + isfin(c_type{T{0}, -inf}) || isfin(c_type{T{0}, quiet_nan}); + return test_true && !test_false; +} + + +} // namespace kernel + + +template +__global__ void test_real_is_finite(bool *result) +{ + *result = kernel::test_real_is_finite_function( + [](T val) { return gko::is_finite(val); }); +} + + +template +__global__ void test_complex_is_finite(bool *result) +{ + *result = kernel::test_complex_is_finite_function( + [](ComplexType val) { return gko::is_finite(val); }); +} + + +class IsFinite : public ::testing::Test { +protected: + IsFinite() + : ref(gko::ReferenceExecutor::create()), + 
hip(gko::HipExecutor::create(0, ref)) + {} + + template + bool test_real_is_finite_kernel() + { + gko::Array result(hip, 1); + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_real_is_finite), dim3(1), + dim3(1), 0, 0, result.get_data()); + result.set_executor(ref); + return *result.get_data(); + } + + template + bool test_complex_is_finite_kernel() + { + gko::Array result(hip, 1); + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_complex_is_finite), dim3(1), + dim3(1), 0, 0, result.get_data()); + result.set_executor(ref); + return *result.get_data(); + } + + std::shared_ptr ref; + std::shared_ptr hip; +}; + + +TEST_F(IsFinite, Float) { ASSERT_TRUE(test_real_is_finite_kernel()); } + + +TEST_F(IsFinite, Double) { ASSERT_TRUE(test_real_is_finite_kernel()); } + + +TEST_F(IsFinite, FloatComplex) +{ + ASSERT_TRUE(test_complex_is_finite_kernel>()); +} + + +TEST_F(IsFinite, DoubleComplex) +{ + ASSERT_TRUE(test_complex_is_finite_kernel>()); +} + + +} // namespace diff --git a/hip/test/components/CMakeLists.txt b/hip/test/components/CMakeLists.txt new file mode 100644 index 00000000000..b3bec2595f9 --- /dev/null +++ b/hip/test/components/CMakeLists.txt @@ -0,0 +1,7 @@ +ginkgo_create_hip_test(cooperative_groups_kernels) +ginkgo_create_hip_test(fill_array) +ginkgo_create_hip_test(merging_kernels) +ginkgo_create_hip_test(precision_conversion) +ginkgo_create_hip_test(prefix_sum) +ginkgo_create_hip_test(searching_kernels) +ginkgo_create_hip_test(sorting_kernels) diff --git a/hip/test/components/cooperative_groups_kernels.hip.cpp b/hip/test/components/cooperative_groups_kernels.hip.cpp new file mode 100644 index 00000000000..823dcef0df1 --- /dev/null +++ b/hip/test/components/cooperative_groups_kernels.hip.cpp @@ -0,0 +1,343 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +// force-top: on +// TODO remove when the HIP includes are fixed +#include +// force-top: off + + +#include "hip/components/cooperative_groups.hip.hpp" + + +#include +#include + + +#include + + +#include +#include + + +#include "hip/base/types.hip.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +using namespace gko::kernels::hip; + + +class CooperativeGroups : public ::testing::Test { +protected: + CooperativeGroups() + : ref(gko::ReferenceExecutor::create()), + hip(gko::HipExecutor::create(0, ref)), + result(ref, 1), + dresult(hip) + { + *result.get_data() = true; + dresult = result; + } + + template + void test(Kernel kernel) + { + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(1), + dim3(config::warp_size), 0, 0, dresult.get_data()); + result = dresult; + auto success = *result.get_const_data(); + + ASSERT_TRUE(success); + } + + template + void test_subwarp(Kernel kernel) + { + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(1), + dim3(config::warp_size / 2), 0, 0, + dresult.get_data()); + result = dresult; + auto success = *result.get_const_data(); + + ASSERT_TRUE(success); + } + + std::shared_ptr ref; + std::shared_ptr hip; + gko::Array result; + gko::Array dresult; +}; + + +constexpr static int subwarp_size = config::warp_size / 4; + + +__device__ void test_assert(bool *success, bool partial) +{ + if (!partial) { + *success = false; + } +} + + +__global__ void cg_shuffle(bool *s) +{ + auto group = + group::tiled_partition(group::this_thread_block()); + auto i = int(group.thread_rank()); + test_assert(s, group.shfl_up(i, 1) == max(0, i - 1)); + test_assert(s, group.shfl_down(i, 1) == min(i + 1, config::warp_size - 1)); + test_assert(s, group.shfl(i, 0) == 0); +} + + +TEST_F(CooperativeGroups, Shuffle) { test(cg_shuffle); } + + +__global__ void cg_all(bool *s) +{ + auto group = + group::tiled_partition(group::this_thread_block()); + test_assert(s, group.all(true)); + 
test_assert(s, !group.all(false)); + test_assert(s, !group.all(threadIdx.x < 13)); +} + + +TEST_F(CooperativeGroups, All) { test(cg_all); } + + +__global__ void cg_any(bool *s) +{ + auto group = + group::tiled_partition(group::this_thread_block()); + test_assert(s, group.any(true)); + test_assert(s, group.any(threadIdx.x == 0)); + test_assert(s, !group.any(false)); +} + + +TEST_F(CooperativeGroups, Any) { test(cg_any); } + + +__global__ void cg_ballot(bool *s) +{ + auto group = + group::tiled_partition(group::this_thread_block()); + test_assert(s, group.ballot(false) == 0); + test_assert(s, group.ballot(true) == ~config::lane_mask_type{}); + test_assert(s, group.ballot(threadIdx.x < 4) == 0xf); +} + + +TEST_F(CooperativeGroups, Ballot) { test(cg_ballot); } + + +__global__ void cg_subwarp_shuffle(bool *s) +{ + auto group = + group::tiled_partition(group::this_thread_block()); + auto i = int(group.thread_rank()); + test_assert(s, group.shfl_up(i, 1) == max(i - 1, 0)); + test_assert(s, group.shfl_down(i, 1) == min(i + 1, subwarp_size - 1)); + auto group_base = threadIdx.x / subwarp_size * subwarp_size; + test_assert(s, group.shfl(int(threadIdx.x), 0) == group_base); + if (threadIdx.x / subwarp_size == 1) { + test_assert(s, group.shfl_up(i, 1) == max(i - 1, 0)); + test_assert(s, group.shfl_down(i, 1) == min(i + 1, subwarp_size - 1)); + test_assert(s, group.shfl(int(threadIdx.x), 0) == group_base); + } else { + test_assert(s, group.shfl_down(i, 1) == min(i + 1, subwarp_size - 1)); + test_assert(s, group.shfl(int(threadIdx.x), 0) == group_base); + test_assert(s, group.shfl_up(i, 1) == max(i - 1, 0)); + } +} + + +TEST_F(CooperativeGroups, SubwarpShuffle) { test(cg_subwarp_shuffle); } + + +TEST_F(CooperativeGroups, SubwarpShuffle2) { test_subwarp(cg_subwarp_shuffle); } + + +__global__ void cg_subwarp_all(bool *s) +{ + auto grp = threadIdx.x / subwarp_size; + bool test_grp = grp == 1; + auto i = threadIdx.x % subwarp_size; + // only test with test_grp, the other threads run 
'interference' + auto group = + group::tiled_partition(group::this_thread_block()); + test_assert(s, !test_grp || group.all(test_grp)); + test_assert(s, !test_grp || !group.all(!test_grp)); + test_assert(s, !test_grp || !group.all(i < subwarp_size - 3 || !test_grp)); + if (test_grp) { + test_assert(s, group.all(true)); + test_assert(s, !group.all(false)); + test_assert(s, !group.all(i < subwarp_size - 3)); + } else { + test_assert(s, !group.all(false)); + test_assert(s, !group.all(i < subwarp_size - 3)); + test_assert(s, group.all(true)); + } +} + + +TEST_F(CooperativeGroups, SubwarpAll) { test(cg_subwarp_all); } + + +TEST_F(CooperativeGroups, SubwarpAll2) { test_subwarp(cg_subwarp_all); } + + +__global__ void cg_subwarp_any(bool *s) +{ + auto grp = threadIdx.x / subwarp_size; + bool test_grp = grp == 1; + // only test with test_grp, the other threads run 'interference' + auto group = + group::tiled_partition(group::this_thread_block()); + auto i = group.thread_rank(); + test_assert(s, !test_grp || group.any(test_grp)); + test_assert(s, !test_grp || group.any(test_grp && i == 1)); + test_assert(s, !test_grp || !group.any(!test_grp)); + if (test_grp) { + test_assert(s, group.any(true)); + test_assert(s, group.any(i == 1)); + test_assert(s, !group.any(false)); + } else { + test_assert(s, !group.any(false)); + test_assert(s, group.any(true)); + test_assert(s, group.any(i == 1)); + } +} + + +TEST_F(CooperativeGroups, SubwarpAny) { test(cg_subwarp_any); } + + +TEST_F(CooperativeGroups, SubwarpAny2) { test_subwarp(cg_subwarp_any); } + + +__global__ void cg_subwarp_ballot(bool *s) +{ + auto grp = threadIdx.x / subwarp_size; + bool test_grp = grp == 1; + auto full_mask = (config::lane_mask_type{1} << subwarp_size) - 1; + // only test with test_grp, the other threads run 'interference' + auto group = + group::tiled_partition(group::this_thread_block()); + auto i = group.thread_rank(); + test_assert(s, !test_grp || group.ballot(!test_grp) == 0); + test_assert(s, !test_grp || 
group.ballot(test_grp) == full_mask); + test_assert(s, !test_grp || group.ballot(i < 4 || !test_grp) == 0xf); + if (test_grp) { + test_assert(s, group.ballot(false) == 0); + test_assert(s, group.ballot(true) == full_mask); + test_assert(s, group.ballot(i < 4) == 0xf); + } else { + test_assert(s, group.ballot(true) == full_mask); + test_assert(s, group.ballot(i < 4) == 0xf); + test_assert(s, group.ballot(false) == 0); + } +} + + +TEST_F(CooperativeGroups, SubwarpBallot) { test(cg_subwarp_ballot); } + + +TEST_F(CooperativeGroups, SubwarpBallot2) { test_subwarp(cg_subwarp_ballot); } + + +template +__global__ void cg_shuffle_sum(const int num, ValueType *__restrict__ value) +{ + auto group = + group::tiled_partition(group::this_thread_block()); + for (int ind = 0; ind < num; ind++) { + value[group.thread_rank()] += group.shfl(value[ind], ind); + } +} + + +TEST_F(CooperativeGroups, ShuffleSumDouble) +{ + int num = 4; + uint64_t x = 0x401022C90008B240; + double x_dbl{}; + std::memcpy(&x_dbl, &x, sizeof(x_dbl)); + gko::Array value(ref, config::warp_size); + gko::Array answer(ref, config::warp_size); + gko::Array dvalue(hip); + for (int i = 0; i < value.get_num_elems(); i++) { + value.get_data()[i] = x_dbl; + answer.get_data()[i] = value.get_data()[i] * (1 << num); + } + dvalue = value; + + hipLaunchKernelGGL(HIP_KERNEL_NAME(cg_shuffle_sum), dim3(1), + dim3(config::warp_size), 0, 0, num, dvalue.get_data()); + + value = dvalue; + GKO_ASSERT_ARRAY_EQ(value, answer); +} + + +TEST_F(CooperativeGroups, ShuffleSumComplexDouble) +{ + int num = 4; + uint64_t x = 0x401022C90008B240; + double x_dbl{}; + std::memcpy(&x_dbl, &x, sizeof(x_dbl)); + gko::Array> value(ref, config::warp_size); + gko::Array> answer(ref, config::warp_size); + gko::Array> dvalue(hip); + for (int i = 0; i < value.get_num_elems(); i++) { + value.get_data()[i] = std::complex{x_dbl, x_dbl}; + answer.get_data()[i] = + std::complex{x_dbl * (1 << num), x_dbl * (1 << num)}; + } + dvalue = value; + + 
hipLaunchKernelGGL(HIP_KERNEL_NAME(cg_shuffle_sum>), + dim3(1), dim3(config::warp_size), 0, 0, num, + as_hip_type(dvalue.get_data())); + + value = dvalue; + GKO_ASSERT_ARRAY_EQ(value, answer); +} + + +} // namespace diff --git a/hip/test/components/fill_array.hip.cpp b/hip/test/components/fill_array.hip.cpp new file mode 100644 index 00000000000..1c7bfda89d0 --- /dev/null +++ b/hip/test/components/fill_array.hip.cpp @@ -0,0 +1,89 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +// force-top: on +// TODO remove when the HIP includes are fixed +#include +// force-top: off + + +#include "core/components/fill_array.hpp" + + +#include +#include +#include + + +#include + + +#include + + +#include "core/test/utils/assertions.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class FillArray : public ::testing::Test { +protected: + using value_type = double; + FillArray() + : ref(gko::ReferenceExecutor::create()), + exec(gko::HipExecutor::create(0, ref)), + total_size(6344), + vals(ref, total_size), + dvals(exec, total_size) + { + std::fill_n(vals.get_data(), total_size, 1234.0); + } + + std::shared_ptr ref; + std::shared_ptr exec; + gko::size_type total_size; + gko::Array vals; + gko::Array dvals; +}; + + +TEST_F(FillArray, EqualsReference) +{ + gko::kernels::hip::components::fill_array(exec, dvals.get_data(), + total_size, 1234.0); + GKO_ASSERT_ARRAY_EQ(vals, dvals); +} + + +} // namespace diff --git a/hip/test/components/merging_kernels.hip.cpp b/hip/test/components/merging_kernels.hip.cpp new file mode 100644 index 00000000000..466c31a48b3 --- /dev/null +++ b/hip/test/components/merging_kernels.hip.cpp @@ -0,0 +1,306 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +// force-top: on +// TODO remove when the HIP includes are fixed +#include +// force-top: off + + +#include "hip/components/merging.hip.hpp" + + +#include +#include +#include +#include + + +#include + + +#include +#include + + +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +using namespace gko::kernels::hip; +using namespace gko::kernels::hip::group; + + +class Merging : public ::testing::Test { +protected: + Merging() + : ref(gko::ReferenceExecutor::create()), + hip(gko::HipExecutor::create(0, ref)), + rng(123456), + rng_runs{100}, + max_size{1637}, + sizes{0, 1, 2, 3, 4, 10, 15, 16, + 31, 34, 102, 242, 534, 956, 1239, 1637}, + data1(ref, max_size), + data2(ref, max_size), + outdata(ref, 2 * max_size), + idxs1(ref), + idxs2(ref), + idxs3(ref), + refidxs1(ref), + refidxs2(ref), + refidxs3(ref), + refdata(ref, 2 * max_size), + ddata1(hip), + ddata2(hip), + didxs1(hip, 2 * max_size), + didxs2(hip, 2 * max_size), + didxs3(hip, 2 * max_size), + drefidxs1(hip, 2 * max_size), + drefidxs2(hip, 2 * max_size), + drefidxs3(hip, 2 * max_size), + doutdata(hip, 2 * max_size) + {} + + void init_data(int rng_run) + { + std::uniform_int_distribution dist(0, max_size); + std::fill_n(data1.get_data(), max_size, 0); + std::fill_n(data2.get_data(), max_size, 0); + for (int i = 0; i < max_size; ++i) { + // here we also want to test some corner cases + // first two runs: zero data1 + if (rng_run > 1) data1.get_data()[i] = dist(rng); + // first and third run: zero data2 + if (rng_run > 2 || rng_run == 1) data2.get_data()[i] = dist(rng); + } + std::sort(data1.get_data(), data1.get_data() + max_size); + std::sort(data2.get_data(), data2.get_data() + max_size); + + ddata1 = data1; + ddata2 = data2; + } + + void assert_eq_ref(int size, int eq_size) + { + outdata = doutdata; + auto out_ptr = outdata.get_const_data(); + auto out_end = out_ptr + eq_size; + auto 
ref_ptr = refdata.get_data(); + std::copy_n(data1.get_const_data(), size, ref_ptr); + std::copy_n(data2.get_const_data(), size, ref_ptr + size); + std::sort(ref_ptr, ref_ptr + 2 * size); + + ASSERT_TRUE(std::equal(out_ptr, out_end, ref_ptr)); + } + + std::shared_ptr ref; + std::shared_ptr hip; + std::default_random_engine rng; + + int rng_runs; + int max_size; + std::vector sizes; + gko::Array data1; + gko::Array data2; + gko::Array idxs1; + gko::Array idxs2; + gko::Array idxs3; + gko::Array refidxs1; + gko::Array refidxs2; + gko::Array refidxs3; + gko::Array outdata; + gko::Array refdata; + gko::Array ddata1; + gko::Array ddata2; + gko::Array didxs1; + gko::Array didxs2; + gko::Array didxs3; + gko::Array drefidxs1; + gko::Array drefidxs2; + gko::Array drefidxs3; + gko::Array doutdata; +}; + + +__global__ void test_merge_step(const gko::int32 *a, const gko::int32 *b, + gko::int32 *c) +{ + auto warp = tiled_partition(this_thread_block()); + auto i = warp.thread_rank(); + auto result = group_merge_step(a[i], b[i], warp); + c[i] = min(result.a_val, result.b_val); +} + +TEST_F(Merging, MergeStep) +{ + for (int i = 0; i < rng_runs; ++i) { + init_data(i); + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge_step), dim3(1), + dim3(config::warp_size), 0, 0, + ddata1.get_const_data(), ddata2.get_const_data(), + doutdata.get_data()); + + assert_eq_ref(config::warp_size, config::warp_size); + } +} + + +__global__ void test_merge(const gko::int32 *a, const gko::int32 *b, int size, + gko::int32 *c) +{ + auto warp = tiled_partition(this_thread_block()); + group_merge(a, size, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, + gko::int32 b_val, int i, bool valid) { + if (valid) { + c[i] = min(a_val, b_val); + } + return true; + }); +} + +TEST_F(Merging, FullMerge) +{ + for (int i = 0; i < rng_runs; ++i) { + init_data(i); + for (auto size : sizes) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge), dim3(1), + dim3(config::warp_size), 0, 0, + ddata1.get_const_data(), 
ddata2.get_const_data(), + size, doutdata.get_data()); + + assert_eq_ref(size, 2 * size); + } + } +} + + +__global__ void test_sequential_merge(const gko::int32 *a, const gko::int32 *b, + int size, gko::int32 *c) +{ + sequential_merge( + a, size, b, size, + [&](int a_idx, gko::int32 a_val, int b_idx, gko::int32 b_val, int i) { + c[i] = min(a_val, b_val); + return true; + }); +} + +TEST_F(Merging, SequentialFullMerge) +{ + for (int i = 0; i < rng_runs; ++i) { + init_data(i); + for (auto size : sizes) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_sequential_merge), dim3(1), + dim3(1), 0, 0, ddata1.get_const_data(), + ddata2.get_const_data(), size, + doutdata.get_data()); + + assert_eq_ref(size, 2 * size); + } + } +} + + +__global__ void test_merge_idxs(const gko::int32 *a, const gko::int32 *b, + int size, gko::int32 *c, gko::int32 *aidxs, + gko::int32 *bidxs, gko::int32 *cidxs, + gko::int32 *refaidxs, gko::int32 *refbidxs, + gko::int32 *refcidxs) +{ + if (threadIdx.x == 0) { + sequential_merge(a, size, b, size, + [&](int a_idx, gko::int32 a_val, int b_idx, + gko::int32 b_val, int i) { + refaidxs[i] = a_idx; + refbidxs[i] = b_idx; + refcidxs[i] = i; + return true; + }); + } + auto warp = tiled_partition(this_thread_block()); + group_merge(a, size, b, size, warp, + [&](int a_idx, gko::int32 a_val, int b_idx, + gko::int32 b_val, int i, bool valid) { + if (valid) { + aidxs[i] = a_idx; + bidxs[i] = b_idx; + cidxs[i] = i; + c[i] = min(a_val, b_val); + } + return true; + }); +} + +TEST_F(Merging, FullMergeIdxs) +{ + for (int i = 0; i < rng_runs; ++i) { + init_data(i); + for (auto size : sizes) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge_idxs), dim3(1), + dim3(config::warp_size), 0, 0, + ddata1.get_const_data(), ddata2.get_const_data(), + size, doutdata.get_data(), didxs1.get_data(), + didxs2.get_data(), didxs3.get_data(), + drefidxs1.get_data(), drefidxs2.get_data(), + drefidxs3.get_data()); + + assert_eq_ref(size, 2 * size); + idxs1 = didxs1; + idxs2 = didxs2; + idxs3 
= didxs3; + refidxs1 = drefidxs1; + refidxs2 = drefidxs2; + refidxs3 = drefidxs3; + auto idxs1_ptr = idxs1.get_const_data(); + auto idxs2_ptr = idxs2.get_const_data(); + auto idxs3_ptr = idxs3.get_const_data(); + auto refidxs1_ptr = refidxs1.get_const_data(); + auto refidxs2_ptr = refidxs2.get_const_data(); + auto refidxs3_ptr = refidxs3.get_const_data(); + + ASSERT_TRUE( + std::equal(idxs1_ptr, idxs1_ptr + 2 * size, refidxs1_ptr)); + ASSERT_TRUE( + std::equal(idxs2_ptr, idxs2_ptr + 2 * size, refidxs2_ptr)); + ASSERT_TRUE( + std::equal(idxs3_ptr, idxs3_ptr + 2 * size, refidxs3_ptr)); + } + } +} + + +} // namespace diff --git a/hip/test/components/precision_conversion.hip.cpp b/hip/test/components/precision_conversion.hip.cpp new file mode 100644 index 00000000000..a7b9713b871 --- /dev/null +++ b/hip/test/components/precision_conversion.hip.cpp @@ -0,0 +1,173 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include +#include +#include +#include +#include + + +#include + + +#include + + +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class PrecisionConversion : public ::testing::Test { +protected: + PrecisionConversion() + : ref(gko::ReferenceExecutor::create()), + exec(gko::HipExecutor::create(0, ref)), + rand(293), + total_size(42793), + vals(ref, total_size), + cvals(ref, total_size), + vals2(ref, 1), + expected_float(ref, 1), + expected_double(ref, 1), + dvals(exec), + dcvals(exec), + dvals2(exec) + { + auto maxval = 1e10f; + std::uniform_real_distribution dist(-maxval, maxval); + for (gko::size_type i = 0; i < total_size; ++i) { + vals.get_data()[i] = dist(rand); + cvals.get_data()[i] = {dist(rand), dist(rand)}; + } + dvals = vals; + dcvals = cvals; + gko::uint64 rawdouble{0x4218888000889111ULL}; + gko::uint32 rawfloat{0x50c44400UL}; + gko::uint64 rawrounded{0x4218888000000000ULL}; + std::memcpy(vals2.get_data(), &rawdouble, sizeof(double)); + std::memcpy(expected_float.get_data(), &rawfloat, sizeof(float)); + std::memcpy(expected_double.get_data(), &rawrounded, sizeof(double)); + dvals2 = vals2; + } + + std::shared_ptr ref; + std::shared_ptr exec; + std::default_random_engine rand; + gko::size_type total_size; + gko::Array vals; + gko::Array dvals; + gko::Array vals2; + gko::Array dvals2; + gko::Array expected_float; + gko::Array expected_double; + gko::Array> cvals; + 
gko::Array> dcvals; +}; + + +TEST_F(PrecisionConversion, ConvertsReal) +{ + gko::Array dtmp; + gko::Array dout; + + dtmp = dvals; + dout = dtmp; + + GKO_ASSERT_ARRAY_EQ(dvals, dout); +} + + +TEST_F(PrecisionConversion, ConvertsRealViaRef) +{ + gko::Array tmp{ref}; + gko::Array dout; + + tmp = dvals; + dout = tmp; + + GKO_ASSERT_ARRAY_EQ(dvals, dout); +} + + +TEST_F(PrecisionConversion, ConvertsComplex) +{ + gko::Array> dtmp; + gko::Array> dout; + + dtmp = dcvals; + dout = dtmp; + + GKO_ASSERT_ARRAY_EQ(dcvals, dout); +} + + +TEST_F(PrecisionConversion, ConversionRounds) +{ + gko::Array dtmp; + gko::Array dout; + + dtmp = dvals2; + dout = dtmp; + + GKO_ASSERT_ARRAY_EQ(dtmp, expected_float); + GKO_ASSERT_ARRAY_EQ(dout, expected_double); +} + + +TEST_F(PrecisionConversion, ConvertsRealFromRef) +{ + gko::Array dtmp; + gko::Array dout; + + dtmp = vals; + dout = dtmp; + + GKO_ASSERT_ARRAY_EQ(dvals, dout); +} + + +TEST_F(PrecisionConversion, ConvertsComplexFromRef) +{ + gko::Array> dtmp; + gko::Array> dout; + + dtmp = cvals; + dout = dtmp; + + GKO_ASSERT_ARRAY_EQ(dcvals, dout); +} + + +} // namespace diff --git a/hip/test/components/prefix_sum.hip.cpp b/hip/test/components/prefix_sum.hip.cpp new file mode 100644 index 00000000000..96f91522d06 --- /dev/null +++ b/hip/test/components/prefix_sum.hip.cpp @@ -0,0 +1,95 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/components/prefix_sum.hpp" + + +#include +#include +#include + + +#include + + +#include + + +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class PrefixSum : public ::testing::Test { +protected: + using index_type = gko::int32; + PrefixSum() + : ref(gko::ReferenceExecutor::create()), + exec(gko::HipExecutor::create(0, ref)), + rand(293), + total_size(42793), + vals(ref, total_size), + dvals(exec) + { + std::uniform_int_distribution dist(0, 1000); + for (gko::size_type i = 0; i < total_size; ++i) { + vals.get_data()[i] = dist(rand); + } + dvals = vals; + } + + void test(gko::size_type size) + { + gko::kernels::reference::components::prefix_sum(ref, vals.get_data(), + size); + gko::kernels::hip::components::prefix_sum(exec, dvals.get_data(), size); + + GKO_ASSERT_ARRAY_EQ(vals, dvals); + } + + std::shared_ptr ref; + std::shared_ptr exec; + std::default_random_engine rand; + gko::size_type total_size; + gko::Array vals; + gko::Array 
dvals; +}; + + +TEST_F(PrefixSum, SmallEqualsReference) { test(100); } + + +TEST_F(PrefixSum, BigEqualsReference) { test(total_size); } + + +} // namespace diff --git a/hip/test/components/searching_kernels.hip.cpp b/hip/test/components/searching_kernels.hip.cpp new file mode 100644 index 00000000000..e55855e40c3 --- /dev/null +++ b/hip/test/components/searching_kernels.hip.cpp @@ -0,0 +1,253 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +// force-top: on +// TODO remove when the HIP includes are fixed +#include +// force-top: off + + +#include "hip/components/searching.hip.hpp" + + +#include +#include +#include + + +#include + + +#include +#include + + +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +using namespace gko::kernels::hip; +using namespace gko::kernels::hip::group; + + +class Searching : public ::testing::Test { +protected: + Searching() + : ref(gko::ReferenceExecutor::create()), + hip(gko::HipExecutor::create(0, ref)), + result(ref, 1), + dresult(hip), + sizes(14203) + { + std::iota(sizes.begin(), sizes.end(), 0); + } + + template + void run_test(Kernel kernel, int offset, int size, unsigned num_blocks = 1) + { + *result.get_data() = true; + dresult = result; + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(num_blocks), + dim3(config::warp_size), 0, 0, dresult.get_data(), + offset, size); + result = dresult; + auto success = *result.get_const_data(); + + ASSERT_TRUE(success); + } + + std::shared_ptr ref; + std::shared_ptr hip; + gko::Array result; + gko::Array dresult; + std::vector sizes; +}; + + +__device__ void test_assert(bool *success, bool predicate) +{ + if (!predicate) { + *success = false; + } +} + + +__global__ void test_binary_search(bool *success, int offset, int size) +{ + // test binary search on [offset, offset + size) + // for all possible partition points + auto result = binary_search(offset, size, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= offset && i < offset + size); + return i >= threadIdx.x + offset; + }); + auto result2 = binary_search(offset, size, [&](int i) { + // don't access out-of-bounds! 
+ test_assert(success, i >= offset && i < offset + size); + return i >= threadIdx.x + offset + 1; + }); + test_assert(success, result == threadIdx.x + offset); + test_assert(success, result2 == threadIdx.x + offset + 1); +} + +TEST_F(Searching, BinaryNoOffset) +{ + run_test(test_binary_search, 0, config::warp_size); +} + +TEST_F(Searching, BinaryOffset) +{ + run_test(test_binary_search, 5, config::warp_size); +} + + +__global__ void test_empty_binary_search(bool *success, int offset, int) +{ + auto result = binary_search(offset, 0, [&](int i) { + // don't access out-of-bounds! + test_assert(success, false); + return false; + }); + test_assert(success, result == offset); +} + +TEST_F(Searching, BinaryEmptyNoOffset) +{ + run_test(test_empty_binary_search, 0, 0); +} + +TEST_F(Searching, BinaryEmptyOffset) +{ + run_test(test_empty_binary_search, 5, 0); +} + + +__global__ void test_sync_binary_search(bool *success, int, int size) +{ + // test binary search on [0, size) + // for all possible partition points + auto result = synchronous_binary_search(size, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= 0 && i < size); + return i >= threadIdx.x; + }); + auto result2 = synchronous_binary_search(size, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= 0 && i < size); + return i >= threadIdx.x + 1; + }); + test_assert(success, result == threadIdx.x); + test_assert(success, result2 == threadIdx.x + 1); +} + +TEST_F(Searching, SyncBinary) +{ + run_test(test_sync_binary_search, 0, config::warp_size); +} + + +__global__ void test_empty_sync_binary_search(bool *success, int, int) +{ + auto result = synchronous_binary_search(0, [&](int i) { + // don't access out-of-bounds! 
+ test_assert(success, false); + return false; + }); + test_assert(success, result == 0); +} + +TEST_F(Searching, EmptySyncBinary) +{ + run_test(test_empty_sync_binary_search, 0, config::warp_size); +} + + +__global__ void test_warp_ary_search(bool *success, int offset, int size) +{ + // test binary search on [offset, offset + size) + // for all possible partition points + auto warp = tiled_partition(this_thread_block()); + auto result = group_ary_search(offset, size, warp, [&](int i) { + // don't access out-of-bounds! + test_assert(success, i >= offset && i < offset + size); + return i >= blockIdx.x + offset; + }); + test_assert(success, result == blockIdx.x + offset); +} + +TEST_F(Searching, WarpAryNoOffset) +{ + for (auto size : sizes) { + run_test(test_warp_ary_search, 0, size, size + 1); + } +} + +TEST_F(Searching, WarpAryOffset) +{ + for (auto size : sizes) { + run_test(test_warp_ary_search, 134, size, size + 1); + } +} + + +__global__ void test_warp_wide_search(bool *success, int offset, int size) +{ + // test binary search on [offset, offset + size) + // for all possible partition points + auto warp = tiled_partition(this_thread_block()); + auto result = group_wide_search(offset, size, warp, [&](int i) { + // don't access out-of-bounds! 
+ test_assert(success, i >= offset && i < offset + size); + return i >= blockIdx.x + offset; + }); + test_assert(success, result == blockIdx.x + offset); +} + +TEST_F(Searching, WarpWideNoOffset) +{ + for (auto size : sizes) { + run_test(test_warp_wide_search, 0, size, size + 1); + } +} + +TEST_F(Searching, WarpWideOffset) +{ + for (auto size : sizes) { + run_test(test_warp_wide_search, 142, size, size + 1); + } +} + + +} // namespace diff --git a/hip/test/components/sorting_kernels.hip.cpp b/hip/test/components/sorting_kernels.hip.cpp new file mode 100644 index 00000000000..ca30186096c --- /dev/null +++ b/hip/test/components/sorting_kernels.hip.cpp @@ -0,0 +1,146 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "hip/components/sorting.hip.hpp" + + +#include +#include + + +#include + + +#include +#include + + +#include "hip/test/utils.hip.hpp" + + +namespace { + + +using gko::kernels::hip::bitonic_sort; +using gko::kernels::hip::config; + + +constexpr auto num_elements = 2048; +constexpr auto num_local = 4; +constexpr auto num_threads = num_elements / num_local; + + +__global__ void test_sort_shared(gko::int32 *data) +{ + gko::int32 local[num_local]; + __shared__ gko::int32 sh_local[num_elements]; + for (int i = 0; i < num_local; ++i) { + local[i] = data[threadIdx.x * num_local + i]; + } + bitonic_sort(local, sh_local); + for (int i = 0; i < num_local; ++i) { + data[threadIdx.x * num_local + i] = local[i]; + } +} + + +__global__ void test_sort_warp(gko::int32 *data) +{ + gko::int32 local[num_local]; + for (int i = 0; i < num_local; ++i) { + local[i] = data[threadIdx.x * num_local + i]; + } + bitonic_sort( + local, static_cast(nullptr)); + for (int i = 0; i < num_local; ++i) { + data[threadIdx.x * num_local + i] = local[i]; + } +} + + +class Sorting : public ::testing::Test { +protected: + Sorting() + : ref(gko::ReferenceExecutor::create()), + hip(gko::HipExecutor::create(0, ref)), + rng(123456), + ref_shared(ref, num_elements), + ref_warp(ref), + ddata(hip) + { + // we want some duplicate elements + std::uniform_int_distribution dist(0, num_elements / 2); + for (auto i = 0; i < 
num_elements; ++i) { + ref_shared.get_data()[i] = dist(rng); + } + ddata = gko::Array{hip, ref_shared}; + ref_warp = ref_shared; + std::sort(ref_shared.get_data(), ref_shared.get_data() + num_elements); + std::sort(ref_warp.get_data(), + ref_warp.get_data() + (config::warp_size * num_local)); + } + + std::shared_ptr ref; + std::shared_ptr hip; + std::default_random_engine rng; + gko::Array ref_shared; + gko::Array ref_warp; + gko::Array ddata; +}; + + +TEST_F(Sorting, HipBitonicSortWarp) +{ + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_sort_warp), dim3(1), + dim3(config::warp_size), 0, 0, ddata.get_data()); + ddata.set_executor(ref); + auto data_ptr = ddata.get_const_data(); + auto ref_ptr = ref_warp.get_const_data(); + + ASSERT_TRUE(std::equal(data_ptr, data_ptr + (num_local * config::warp_size), + ref_ptr)); +} + + +TEST_F(Sorting, HipBitonicSortShared) +{ + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_sort_shared), dim3(1), + dim3(num_threads), 0, 0, ddata.get_data()); + ddata.set_executor(ref); + auto data_ptr = ddata.get_const_data(); + auto ref_ptr = ref_shared.get_const_data(); + + ASSERT_TRUE(std::equal(data_ptr, data_ptr + num_elements, ref_ptr)); +} + + +} // namespace diff --git a/hip/test/factorization/CMakeLists.txt b/hip/test/factorization/CMakeLists.txt new file mode 100644 index 00000000000..da6c40ca680 --- /dev/null +++ b/hip/test/factorization/CMakeLists.txt @@ -0,0 +1,4 @@ +ginkgo_create_hip_test_special_linkage(ilu_kernels) +ginkgo_create_hip_test(par_ict_kernels) +ginkgo_create_hip_test(par_ilu_kernels) +ginkgo_create_hip_test(par_ilut_kernels) diff --git a/hip/test/factorization/ilu_kernels.cpp b/hip/test/factorization/ilu_kernels.cpp new file mode 100644 index 00000000000..b0bffcdd430 --- /dev/null +++ b/hip/test/factorization/ilu_kernels.cpp @@ -0,0 +1,121 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#include +#include +#include + + +#include + + +#include +#include + + +#include "hip/test/utils.hip.hpp" +#include "matrices/config.hpp" + + +namespace { + + +class Ilu : public ::testing::Test { +protected: + using value_type = gko::default_precision; + using index_type = gko::int32; + using Csr = gko::matrix::Csr; + + std::shared_ptr ref; + std::shared_ptr hip; + std::shared_ptr csr_ref; + std::shared_ptr csr_hip; + + Ilu() + : ref(gko::ReferenceExecutor::create()), + hip(gko::HipExecutor::create(0, ref)) + {} + + void SetUp() override + { + std::string file_name(gko::matrices::location_ani4_mtx); + auto input_file = std::ifstream(file_name, std::ios::in); + if (!input_file) { + FAIL() << "Could not find the file \"" << file_name + << "\", which is required for this test.\n"; + } + csr_ref = gko::read(input_file, ref); + csr_hip = Csr::create(hip); + csr_hip->copy_from(gko::lend(csr_ref)); + } +}; + + +TEST_F(Ilu, ComputeILUIsEquivalentToRef) +{ + auto ref_fact = + gko::factorization::ParIlu<>::build().on(ref)->generate(csr_ref); + auto hip_fact = + gko::factorization::Ilu<>::build().on(hip)->generate(csr_hip); + + GKO_ASSERT_MTX_NEAR(ref_fact->get_l_factor(), hip_fact->get_l_factor(), + 1e-14); + GKO_ASSERT_MTX_NEAR(ref_fact->get_u_factor(), hip_fact->get_u_factor(), + 1e-14); + GKO_ASSERT_MTX_EQ_SPARSITY(ref_fact->get_l_factor(), + hip_fact->get_l_factor()); + GKO_ASSERT_MTX_EQ_SPARSITY(ref_fact->get_u_factor(), + hip_fact->get_u_factor()); +} + + +TEST_F(Ilu, SetsCorrectStrategy) +{ + auto hip_fact = + gko::factorization::Ilu<>::build() + .with_l_strategy(std::make_shared()) + .with_u_strategy(std::make_shared(hip)) + .on(hip) + ->generate(csr_hip); + + ASSERT_EQ(hip_fact->get_l_factor()->get_strategy()->get_name(), + "merge_path"); + ASSERT_EQ(hip_fact->get_u_factor()->get_strategy()->get_name(), + "load_balance"); +} + + +} // namespace diff --git 
a/hip/test/factorization/par_ict_kernels.hip.cpp b/hip/test/factorization/par_ict_kernels.hip.cpp new file mode 100644 index 00000000000..b8858dadaa4 --- /dev/null +++ b/hip/test/factorization/par_ict_kernels.hip.cpp @@ -0,0 +1,177 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/factorization/par_ict_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/factorization/factorization_kernels.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "hip/test/utils.hip.hpp" +#include "matrices/config.hpp" + + +namespace { + + +class ParIct : public ::testing::Test { +protected: + using value_type = gko::default_precision; + using index_type = gko::int32; + using Coo = gko::matrix::Coo; + using Csr = gko::matrix::Csr; + + ParIct() + : mtx_size(500, 500), + rand_engine(6780), + ref(gko::ReferenceExecutor::create()), + hip(gko::HipExecutor::create(0, ref)) + { + mtx = gko::test::generate_random_matrix( + mtx_size[0], mtx_size[1], + std::uniform_int_distribution<>(10, mtx_size[1]), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + mtx_l = gko::test::generate_random_lower_triangular_matrix( + mtx_size[0], mtx_size[0], false, + std::uniform_int_distribution<>(1, mtx_size[0]), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + + dmtx_ani = Csr::create(hip); + dmtx_l_ani = Csr::create(hip); + dmtx = Csr::create(hip); + dmtx->copy_from(lend(mtx)); + dmtx_l = Csr::create(hip); + dmtx_l->copy_from(lend(mtx_l)); + } + + void SetUp() + { + std::string file_name(gko::matrices::location_ani4_mtx); + auto input_file = std::ifstream(file_name, std::ios::in); + if (!input_file) { + FAIL() << "Could not find the file \"" << file_name + << "\", which is required for this test.\n"; + } + mtx_ani = gko::read(input_file, ref); + mtx_ani->sort_by_column_index(); + + { + mtx_l_ani = Csr::create(ref, mtx_ani->get_size()); + gko::matrix::CsrBuilder l_builder( + lend(mtx_l_ani)); + gko::kernels::reference::factorization::initialize_row_ptrs_l( + ref, lend(mtx_ani), mtx_l_ani->get_row_ptrs()); + auto l_nnz = + 
mtx_l_ani->get_const_row_ptrs()[mtx_ani->get_size()[0]]; + l_builder.get_col_idx_array().resize_and_reset(l_nnz); + l_builder.get_value_array().resize_and_reset(l_nnz); + gko::kernels::reference::factorization::initialize_l( + ref, lend(mtx_ani), lend(mtx_l_ani), true); + } + dmtx_ani->copy_from(lend(mtx_ani)); + dmtx_l_ani->copy_from(lend(mtx_l_ani)); + } + + std::shared_ptr ref; + std::shared_ptr hip; + + const gko::dim<2> mtx_size; + std::default_random_engine rand_engine; + + std::unique_ptr mtx; + std::unique_ptr mtx_ani; + std::unique_ptr mtx_l_ani; + std::unique_ptr mtx_l; + + std::unique_ptr dmtx; + std::unique_ptr dmtx_ani; + std::unique_ptr dmtx_l_ani; + std::unique_ptr dmtx_l; +}; + + +TEST_F(ParIct, KernelAddCandidatesIsEquivalentToRef) +{ + auto mtx_llt = Csr::create(ref, mtx_size); + mtx_l->apply(lend(mtx_l->transpose()), lend(mtx_llt)); + auto dmtx_llt = Csr::create(hip, mtx_size); + dmtx_llt->copy_from(lend(mtx_llt)); + auto res_mtx_l = Csr::create(ref, mtx_size); + auto dres_mtx_l = Csr::create(hip, mtx_size); + + gko::kernels::reference::par_ict_factorization::add_candidates( + ref, lend(mtx_llt), lend(mtx), lend(mtx_l), lend(res_mtx_l)); + gko::kernels::hip::par_ict_factorization::add_candidates( + hip, lend(dmtx_llt), lend(dmtx), lend(dmtx_l), lend(dres_mtx_l)); + + GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_l, dres_mtx_l); + GKO_ASSERT_MTX_NEAR(res_mtx_l, dres_mtx_l, 1e-14); +} + + +TEST_F(ParIct, KernelComputeFactorIsEquivalentToRef) +{ + auto square_size = mtx_ani->get_size(); + auto mtx_l_coo = Coo::create(ref, square_size); + mtx_l_ani->convert_to(lend(mtx_l_coo)); + auto dmtx_l_coo = Coo::create(hip, square_size); + dmtx_l_coo->copy_from(lend(mtx_l_coo)); + + gko::kernels::reference::par_ict_factorization::compute_factor( + ref, lend(mtx_ani), lend(mtx_l_ani), lend(mtx_l_coo)); + for (int i = 0; i < 20; ++i) { + gko::kernels::hip::par_ict_factorization::compute_factor( + hip, lend(dmtx_ani), lend(dmtx_l_ani), lend(dmtx_l_coo)); + } + + 
GKO_ASSERT_MTX_NEAR(mtx_l_ani, dmtx_l_ani, 1e-2); +} + + +} // namespace diff --git a/hip/test/factorization/par_ilu_kernels.hip.cpp b/hip/test/factorization/par_ilu_kernels.hip.cpp new file mode 100644 index 00000000000..96dffed19e1 --- /dev/null +++ b/hip/test/factorization/par_ilu_kernels.hip.cpp @@ -0,0 +1,349 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/factorization/par_ilu_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/factorization/factorization_kernels.hpp" +#include "hip/test/utils.hip.hpp" +#include "matrices/config.hpp" + + +namespace { + + +class ParIlu : public ::testing::Test { +protected: + using value_type = gko::default_precision; + using index_type = gko::int32; + using Dense = gko::matrix::Dense; + using Coo = gko::matrix::Coo; + using Csr = gko::matrix::Csr; + + std::ranlux48 rand_engine; + std::shared_ptr ref; + std::shared_ptr hip; + std::shared_ptr csr_ref; + std::shared_ptr csr_hip; + + ParIlu() + : rand_engine(19), + ref(gko::ReferenceExecutor::create()), + hip(gko::HipExecutor::create(0, ref)), + csr_ref(nullptr), + csr_hip(nullptr) + {} + + void SetUp() override + { + std::string file_name(gko::matrices::location_ani4_mtx); + auto input_file = std::ifstream(file_name, std::ios::in); + if (!input_file) { + FAIL() << "Could not find the file \"" << file_name + << "\", which is required for this test.\n"; + } + auto csr_ref_temp = gko::read(input_file, ref); + auto csr_hip_temp = Csr::create(hip); + csr_hip_temp->copy_from(gko::lend(csr_ref_temp)); + // Make sure there are diagonal elements present + gko::kernels::reference::factorization::add_diagonal_elements( + ref, gko::lend(csr_ref_temp), false); + gko::kernels::hip::factorization::add_diagonal_elements( + hip, gko::lend(csr_hip_temp), false); + csr_ref = gko::give(csr_ref_temp); + csr_hip = gko::give(csr_hip_temp); + } + + template + std::unique_ptr gen_mtx(index_type num_rows, index_type num_cols) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution(0, num_cols - 1), + std::normal_distribution(0.0, 1.0), rand_engine, ref); + } + + std::unique_ptr gen_unsorted_mtx(index_type num_rows, + index_type 
num_cols) + { + using std::swap; + auto mtx = gen_mtx(num_rows, num_cols); + auto values = mtx->get_values(); + auto col_idxs = mtx->get_col_idxs(); + const auto row_ptrs = mtx->get_const_row_ptrs(); + for (int row = 0; row < num_rows; ++row) { + const auto row_start = row_ptrs[row]; + const auto row_end = row_ptrs[row + 1]; + const int num_row_elements = row_end - row_start; + auto idx_dist = std::uniform_int_distribution( + row_start, row_end - 1); + for (int i = 0; i < num_row_elements / 2; ++i) { + auto idx1 = idx_dist(rand_engine); + auto idx2 = idx_dist(rand_engine); + if (idx1 != idx2) { + swap(values[idx1], values[idx2]); + swap(col_idxs[idx1], col_idxs[idx2]); + } + } + } + return mtx; + } + + void initialize_row_ptrs(index_type *l_row_ptrs_ref, + index_type *u_row_ptrs_ref, + index_type *l_row_ptrs_hip, + index_type *u_row_ptrs_hip) + { + gko::kernels::reference::factorization::initialize_row_ptrs_l_u( + ref, gko::lend(csr_ref), l_row_ptrs_ref, u_row_ptrs_ref); + gko::kernels::hip::factorization::initialize_row_ptrs_l_u( + hip, gko::lend(csr_hip), l_row_ptrs_hip, u_row_ptrs_hip); + } + + void initialize_lu(std::unique_ptr *l_ref, std::unique_ptr *u_ref, + std::unique_ptr *l_hip, std::unique_ptr *u_hip) + { + auto num_row_ptrs = csr_ref->get_size()[0] + 1; + gko::Array l_row_ptrs_ref{ref, num_row_ptrs}; + gko::Array u_row_ptrs_ref{ref, num_row_ptrs}; + gko::Array l_row_ptrs_hip{hip, num_row_ptrs}; + gko::Array u_row_ptrs_hip{hip, num_row_ptrs}; + + initialize_row_ptrs( + l_row_ptrs_ref.get_data(), u_row_ptrs_ref.get_data(), + l_row_ptrs_hip.get_data(), u_row_ptrs_hip.get_data()); + // Since `initialize_row_ptrs` was already tested, it is expected that + // `*_ref` and `*_hip` contain identical values + auto l_nnz = l_row_ptrs_ref.get_const_data()[num_row_ptrs - 1]; + auto u_nnz = u_row_ptrs_ref.get_const_data()[num_row_ptrs - 1]; + + *l_ref = Csr::create(ref, csr_ref->get_size(), l_nnz); + *u_ref = Csr::create(ref, csr_ref->get_size(), u_nnz); + *l_hip = 
Csr::create(hip, csr_hip->get_size(), l_nnz); + *u_hip = Csr::create(hip, csr_hip->get_size(), u_nnz); + // Copy the already initialized `row_ptrs` to the new matrices + ref->copy(num_row_ptrs, l_row_ptrs_ref.get_data(), + (*l_ref)->get_row_ptrs()); + ref->copy(num_row_ptrs, u_row_ptrs_ref.get_data(), + (*u_ref)->get_row_ptrs()); + hip->copy(num_row_ptrs, l_row_ptrs_hip.get_data(), + (*l_hip)->get_row_ptrs()); + hip->copy(num_row_ptrs, u_row_ptrs_hip.get_data(), + (*u_hip)->get_row_ptrs()); + + gko::kernels::reference::factorization::initialize_l_u( + ref, gko::lend(csr_ref), gko::lend(*l_ref), gko::lend(*u_ref)); + gko::kernels::hip::factorization::initialize_l_u( + hip, gko::lend(csr_hip), gko::lend(*l_hip), gko::lend(*u_hip)); + } + + template + static std::unique_ptr static_unique_ptr_cast( + std::unique_ptr &&from) + { + return std::unique_ptr{static_cast(from.release())}; + } + + void compute_lu(std::unique_ptr *l_ref, std::unique_ptr *u_ref, + std::unique_ptr *l_hip, std::unique_ptr *u_hip, + gko::size_type iterations = 0) + { + auto coo_ref = Coo::create(ref); + csr_ref->convert_to(gko::lend(coo_ref)); + auto coo_hip = Coo::create(hip); + csr_hip->convert_to(gko::lend(coo_hip)); + initialize_lu(l_ref, u_ref, l_hip, u_hip); + auto u_transpose_lin_op_ref = (*u_ref)->transpose(); + auto u_transpose_csr_ref = + static_unique_ptr_cast(std::move(u_transpose_lin_op_ref)); + auto u_transpose_lin_op_hip = (*u_hip)->transpose(); + auto u_transpose_csr_hip = + static_unique_ptr_cast(std::move(u_transpose_lin_op_hip)); + + gko::kernels::reference::par_ilu_factorization::compute_l_u_factors( + ref, iterations, gko::lend(coo_ref), gko::lend(*l_ref), + gko::lend(u_transpose_csr_ref)); + gko::kernels::hip::par_ilu_factorization::compute_l_u_factors( + hip, iterations, gko::lend(coo_hip), gko::lend(*l_hip), + gko::lend(u_transpose_csr_hip)); + auto u_lin_op_ref = u_transpose_csr_ref->transpose(); + *u_ref = static_unique_ptr_cast(std::move(u_lin_op_ref)); + auto 
u_lin_op_hip = u_transpose_csr_hip->transpose(); + *u_hip = static_unique_ptr_cast(std::move(u_lin_op_hip)); + } +}; + + +TEST_F(ParIlu, HipKernelAddDiagonalElementsSortedEquivalentToRef) +{ + index_type num_rows{600}; + index_type num_cols{600}; + auto mtx_ref = gen_mtx(num_rows, num_cols); + auto mtx_hip = Csr::create(hip); + mtx_hip->copy_from(gko::lend(mtx_ref)); + + gko::kernels::reference::factorization::add_diagonal_elements( + ref, gko::lend(mtx_ref), true); + gko::kernels::hip::factorization::add_diagonal_elements( + hip, gko::lend(mtx_hip), true); + hip->synchronize(); + + ASSERT_TRUE(mtx_ref->is_sorted_by_column_index()); + GKO_ASSERT_MTX_NEAR(mtx_ref, mtx_hip, 0.); + GKO_ASSERT_MTX_EQ_SPARSITY(mtx_ref, mtx_hip); +} + + +TEST_F(ParIlu, HipKernelAddDiagonalElementsUnsortedEquivalentToRef) +{ + index_type num_rows{600}; + index_type num_cols{600}; + auto mtx_ref = gen_unsorted_mtx(num_rows, num_cols); + auto mtx_hip = Csr::create(hip); + mtx_hip->copy_from(gko::lend(mtx_ref)); + + gko::kernels::reference::factorization::add_diagonal_elements( + ref, gko::lend(mtx_ref), false); + gko::kernels::hip::factorization::add_diagonal_elements( + hip, gko::lend(mtx_hip), false); + hip->synchronize(); + + ASSERT_FALSE(mtx_ref->is_sorted_by_column_index()); + GKO_ASSERT_MTX_NEAR(mtx_ref, mtx_hip, 0.); + GKO_ASSERT_MTX_EQ_SPARSITY(mtx_ref, mtx_hip); +} + + +TEST_F(ParIlu, HipKernelAddDiagonalElementsNonSquareEquivalentToRef) +{ + index_type num_rows{600}; + index_type num_cols{500}; + auto mtx_ref = gen_mtx(num_rows, num_cols); + auto mtx_hip = Csr::create(hip); + mtx_hip->copy_from(gko::lend(mtx_ref)); + + gko::kernels::reference::factorization::add_diagonal_elements( + ref, gko::lend(mtx_ref), true); + gko::kernels::hip::factorization::add_diagonal_elements( + hip, gko::lend(mtx_hip), true); + hip->synchronize(); + + ASSERT_TRUE(mtx_ref->is_sorted_by_column_index()); + GKO_ASSERT_MTX_NEAR(mtx_ref, mtx_hip, 0.); + GKO_ASSERT_MTX_EQ_SPARSITY(mtx_ref, mtx_hip); +} + + 
+TEST_F(ParIlu, KernelInitializeRowPtrsLUEquivalentToRef) +{ + auto num_row_ptrs = csr_ref->get_size()[0] + 1; + gko::Array l_row_ptrs_array_ref(ref, num_row_ptrs); + gko::Array u_row_ptrs_array_ref(ref, num_row_ptrs); + gko::Array l_row_ptrs_array_hip(hip, num_row_ptrs); + gko::Array u_row_ptrs_array_hip(hip, num_row_ptrs); + + initialize_row_ptrs( + l_row_ptrs_array_ref.get_data(), u_row_ptrs_array_ref.get_data(), + l_row_ptrs_array_hip.get_data(), u_row_ptrs_array_hip.get_data()); + + GKO_ASSERT_ARRAY_EQ(l_row_ptrs_array_ref, l_row_ptrs_array_hip); + GKO_ASSERT_ARRAY_EQ(u_row_ptrs_array_ref, u_row_ptrs_array_hip); +} + + +TEST_F(ParIlu, KernelInitializeParILUIsEquivalentToRef) +{ + std::unique_ptr l_ref{}; + std::unique_ptr u_ref{}; + std::unique_ptr l_hip{}; + std::unique_ptr u_hip{}; + + initialize_lu(&l_ref, &u_ref, &l_hip, &u_hip); + + GKO_ASSERT_MTX_NEAR(l_ref, l_hip, 1e-14); + GKO_ASSERT_MTX_NEAR(u_ref, u_hip, 1e-14); + GKO_ASSERT_MTX_EQ_SPARSITY(l_ref, l_hip); + GKO_ASSERT_MTX_EQ_SPARSITY(u_ref, u_hip); +} + + +TEST_F(ParIlu, KernelComputeParILUIsEquivalentToRef) +{ + std::unique_ptr l_ref{}; + std::unique_ptr u_ref{}; + std::unique_ptr l_hip{}; + std::unique_ptr u_hip{}; + + compute_lu(&l_ref, &u_ref, &l_hip, &u_hip); + + GKO_ASSERT_MTX_NEAR(l_ref, l_hip, 5e-2); + GKO_ASSERT_MTX_NEAR(u_ref, u_hip, 5e-2); + GKO_ASSERT_MTX_EQ_SPARSITY(l_ref, l_hip); + GKO_ASSERT_MTX_EQ_SPARSITY(u_ref, u_hip); +} + + +TEST_F(ParIlu, KernelComputeParILUWithMoreIterationsIsEquivalentToRef) +{ + std::unique_ptr l_ref{}; + std::unique_ptr u_ref{}; + std::unique_ptr l_hip{}; + std::unique_ptr u_hip{}; + gko::size_type iterations{200}; + + compute_lu(&l_ref, &u_ref, &l_hip, &u_hip, iterations); + + GKO_ASSERT_MTX_NEAR(l_ref, l_hip, 1e-14); + GKO_ASSERT_MTX_NEAR(u_ref, u_hip, 1e-14); + GKO_ASSERT_MTX_EQ_SPARSITY(l_ref, l_hip); + GKO_ASSERT_MTX_EQ_SPARSITY(u_ref, u_hip); +} + + +} // namespace diff --git a/hip/test/factorization/par_ilut_kernels.hip.cpp 
b/hip/test/factorization/par_ilut_kernels.hip.cpp new file mode 100644 index 00000000000..38fb5eb205f --- /dev/null +++ b/hip/test/factorization/par_ilut_kernels.hip.cpp @@ -0,0 +1,547 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/factorization/factorization_kernels.hpp" +#include "core/factorization/par_ilu_kernels.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "hip/test/utils.hip.hpp" +#include "matrices/config.hpp" + + +namespace { + + +class ParIlut : public ::testing::Test { +protected: + using value_type = gko::default_precision; + using index_type = gko::int32; + using Dense = gko::matrix::Dense; + using ComplexDense = gko::matrix::Dense>; + using Coo = gko::matrix::Coo; + using Csr = gko::matrix::Csr; + using ComplexCsr = gko::matrix::Csr, index_type>; + + ParIlut() + : mtx_size(500, 700), + rand_engine(1337), + ref(gko::ReferenceExecutor::create()), + hip(gko::HipExecutor::create(0, ref)) + { + mtx1 = gko::test::generate_random_matrix( + mtx_size[0], mtx_size[1], + std::uniform_int_distribution<>(10, mtx_size[1]), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + mtx2 = gko::test::generate_random_matrix( + mtx_size[0], mtx_size[1], + std::uniform_int_distribution<>(0, mtx_size[1]), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + mtx_square = gko::test::generate_random_matrix( + mtx_size[0], mtx_size[0], + std::uniform_int_distribution<>(1, mtx_size[0]), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + mtx_l = gko::test::generate_random_lower_triangular_matrix( + mtx_size[0], mtx_size[0], false, + std::uniform_int_distribution<>(1, mtx_size[0]), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + mtx_l2 = gko::test::generate_random_lower_triangular_matrix( + mtx_size[0], mtx_size[0], true, + std::uniform_int_distribution<>(1, mtx_size[0]), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + mtx_l_complex = + 
gko::test::generate_random_lower_triangular_matrix( + mtx_size[0], mtx_size[0], false, + std::uniform_int_distribution<>(10, mtx_size[0]), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + mtx_u = gko::test::generate_random_upper_triangular_matrix( + mtx_size[0], mtx_size[0], false, + std::uniform_int_distribution<>(10, mtx_size[0]), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + mtx_u_complex = + gko::test::generate_random_upper_triangular_matrix( + mtx_size[0], mtx_size[0], false, + std::uniform_int_distribution<>(10, mtx_size[0]), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + + + dmtx1 = Csr::create(hip); + dmtx1->copy_from(mtx1.get()); + dmtx2 = Csr::create(hip); + dmtx2->copy_from(mtx2.get()); + dmtx_square = Csr::create(hip); + dmtx_square->copy_from(mtx_square.get()); + dmtx_ani = Csr::create(hip); + dmtx_l_ani = Csr::create(hip); + dmtx_u_ani = Csr::create(hip); + dmtx_ut_ani = Csr::create(hip); + dmtx_l = Csr::create(hip); + dmtx_l->copy_from(mtx_l.get()); + dmtx_l2 = Csr::create(hip); + dmtx_l2->copy_from(mtx_l2.get()); + dmtx_u = Csr::create(hip); + dmtx_u->copy_from(mtx_u.get()); + dmtx_l_complex = ComplexCsr::create(hip); + dmtx_l_complex->copy_from(mtx_l_complex.get()); + dmtx_u_complex = ComplexCsr::create(hip); + dmtx_u_complex->copy_from(mtx_u_complex.get()); + } + + void SetUp() + { + std::string file_name(gko::matrices::location_ani4_mtx); + auto input_file = std::ifstream(file_name, std::ios::in); + if (!input_file) { + FAIL() << "Could not find the file \"" << file_name + << "\", which is required for this test.\n"; + } + mtx_ani = gko::read(input_file, ref); + mtx_ani->sort_by_column_index(); + + { + mtx_l_ani = Csr::create(ref, mtx_ani->get_size()); + mtx_u_ani = Csr::create(ref, mtx_ani->get_size()); + gko::matrix::CsrBuilder l_builder( + mtx_l_ani.get()); + gko::matrix::CsrBuilder u_builder( + mtx_u_ani.get()); + gko::kernels::reference::factorization::initialize_row_ptrs_l_u( + ref, mtx_ani.get(), 
mtx_l_ani->get_row_ptrs(), + mtx_u_ani->get_row_ptrs()); + auto l_nnz = + mtx_l_ani->get_const_row_ptrs()[mtx_ani->get_size()[0]]; + auto u_nnz = + mtx_u_ani->get_const_row_ptrs()[mtx_ani->get_size()[0]]; + l_builder.get_col_idx_array().resize_and_reset(l_nnz); + l_builder.get_value_array().resize_and_reset(l_nnz); + u_builder.get_col_idx_array().resize_and_reset(u_nnz); + u_builder.get_value_array().resize_and_reset(u_nnz); + gko::kernels::reference::factorization::initialize_l_u( + ref, mtx_ani.get(), mtx_l_ani.get(), mtx_u_ani.get()); + mtx_ut_ani = Csr::create(ref, mtx_ani->get_size(), + mtx_u_ani->get_num_stored_elements()); + gko::kernels::reference::csr::transpose(ref, mtx_u_ani.get(), + mtx_ut_ani.get()); + } + dmtx_ani->copy_from(mtx_ani.get()); + dmtx_l_ani->copy_from(mtx_l_ani.get()); + dmtx_u_ani->copy_from(mtx_u_ani.get()); + dmtx_ut_ani->copy_from(mtx_ut_ani.get()); + } + + template + void test_select(const std::unique_ptr &mtx, + const std::unique_ptr &dmtx, index_type rank, + value_type tolerance = 0.0) + { + auto size = index_type(mtx->get_num_stored_elements()); + using ValueType = typename Mtx::value_type; + + gko::remove_complex res{}; + gko::remove_complex dres{}; + gko::Array tmp(ref); + gko::Array> tmp2(ref); + gko::Array dtmp(hip); + gko::Array> dtmp2(hip); + + gko::kernels::reference::par_ilut_factorization::threshold_select( + ref, mtx.get(), rank, tmp, tmp2, res); + gko::kernels::hip::par_ilut_factorization::threshold_select( + hip, dmtx.get(), rank, dtmp, dtmp2, dres); + + ASSERT_NEAR(res, dres, tolerance); + } + + template > + void test_filter(const std::unique_ptr &mtx, + const std::unique_ptr &dmtx, value_type threshold, + bool lower) + { + auto res = Mtx::create(ref, mtx_size); + auto dres = Mtx::create(hip, mtx_size); + auto res_coo = Coo::create(ref, mtx_size); + auto dres_coo = Coo::create(hip, mtx_size); + auto local_mtx = gko::as(lower ? mtx->clone() : mtx->transpose()); + auto local_dmtx = + gko::as(lower ? 
dmtx->clone() : dmtx->transpose()); + + gko::kernels::reference::par_ilut_factorization::threshold_filter( + ref, local_mtx.get(), threshold, res.get(), res_coo.get(), lower); + gko::kernels::hip::par_ilut_factorization::threshold_filter( + hip, local_dmtx.get(), threshold, dres.get(), dres_coo.get(), + lower); + + GKO_ASSERT_MTX_NEAR(res, dres, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(res, dres); + GKO_ASSERT_MTX_NEAR(res, res_coo, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(res, res_coo); + GKO_ASSERT_MTX_NEAR(dres, dres_coo, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(dres, dres_coo); + } + + template > + void test_filter_approx(const std::unique_ptr &mtx, + const std::unique_ptr &dmtx, index_type rank, + value_type tolerance = 0.0) + { + auto res = Mtx::create(ref, mtx_size); + auto dres = Mtx::create(hip, mtx_size); + auto res_coo = Coo::create(ref, mtx_size); + auto dres_coo = Coo::create(hip, mtx_size); + using ValueType = typename Mtx::value_type; + + gko::Array tmp(ref); + gko::Array dtmp(hip); + gko::remove_complex threshold{}; + gko::remove_complex dthreshold{}; + + gko::kernels::reference::par_ilut_factorization:: + threshold_filter_approx(ref, mtx.get(), rank, tmp, threshold, + res.get(), res_coo.get()); + gko::kernels::hip::par_ilut_factorization::threshold_filter_approx( + hip, dmtx.get(), rank, dtmp, dthreshold, dres.get(), + dres_coo.get()); + + GKO_ASSERT_MTX_NEAR(res, dres, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(res, dres); + GKO_ASSERT_MTX_NEAR(res, res_coo, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(res, res_coo); + GKO_ASSERT_MTX_NEAR(dres, dres_coo, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(dres, dres_coo); + ASSERT_NEAR(threshold, dthreshold, tolerance); + } + + std::shared_ptr ref; + std::shared_ptr hip; + + const gko::dim<2> mtx_size; + std::default_random_engine rand_engine; + + std::unique_ptr mtx1; + std::unique_ptr mtx2; + std::unique_ptr mtx_square; + std::unique_ptr mtx_ani; + std::unique_ptr mtx_l_ani; + std::unique_ptr mtx_u_ani; + std::unique_ptr mtx_ut_ani; + std::unique_ptr mtx_l; + 
std::unique_ptr mtx_l2; + std::unique_ptr mtx_l_complex; + std::unique_ptr mtx_u; + std::unique_ptr mtx_u_complex; + + std::unique_ptr dmtx1; + std::unique_ptr dmtx2; + std::unique_ptr dmtx_square; + std::unique_ptr dmtx_ani; + std::unique_ptr dmtx_l_ani; + std::unique_ptr dmtx_u_ani; + std::unique_ptr dmtx_ut_ani; + std::unique_ptr dmtx_l; + std::unique_ptr dmtx_l2; + std::unique_ptr dmtx_l_complex; + std::unique_ptr dmtx_u; + std::unique_ptr dmtx_u_complex; +}; + + +TEST_F(ParIlut, KernelThresholdSelectIsEquivalentToRef) +{ + test_select(mtx_l, dmtx_l, mtx_l->get_num_stored_elements() / 3); +} + + +TEST_F(ParIlut, KernelThresholdSelectMinIsEquivalentToRef) +{ + test_select(mtx_l, dmtx_l, 0); +} + + +TEST_F(ParIlut, KernelThresholdSelectMaxIsEquivalentToRef) +{ + test_select(mtx_l, dmtx_l, mtx_l->get_num_stored_elements() - 1); +} + + +TEST_F(ParIlut, KernelComplexThresholdSelectIsEquivalentToRef) +{ + test_select(mtx_l_complex, dmtx_l_complex, + mtx_l_complex->get_num_stored_elements() / 3, 1e-14); +} + + +TEST_F(ParIlut, KernelComplexThresholdSelectMinIsEquivalentToRef) +{ + test_select(mtx_l_complex, dmtx_l_complex, 0, 1e-14); +} + + +TEST_F(ParIlut, KernelComplexThresholdSelectMaxLowerIsEquivalentToRef) +{ + test_select(mtx_l_complex, dmtx_l_complex, + mtx_l_complex->get_num_stored_elements() - 1, 1e-14); +} + + +TEST_F(ParIlut, KernelThresholdFilterNullptrCooIsEquivalentToRef) +{ + auto res = Csr::create(ref, mtx_size); + auto dres = Csr::create(hip, mtx_size); + Coo *null_coo = nullptr; + + gko::kernels::reference::par_ilut_factorization::threshold_filter( + ref, mtx_l.get(), 0.5, res.get(), null_coo, true); + gko::kernels::hip::par_ilut_factorization::threshold_filter( + hip, dmtx_l.get(), 0.5, dres.get(), null_coo, true); + + GKO_ASSERT_MTX_NEAR(res, dres, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(res, dres); +} + + +TEST_F(ParIlut, KernelThresholdFilterLowerIsEquivalentToRef) +{ + test_filter(mtx_l, dmtx_l, 0.5, true); +} + + +TEST_F(ParIlut, 
KernelThresholdFilterUpperIsEquivalentToRef) +{ + test_filter(mtx_l, dmtx_l, 0.5, false); +} + + +TEST_F(ParIlut, KernelThresholdFilterNoneLowerIsEquivalentToRef) +{ + test_filter(mtx_l, dmtx_l, 0, true); +} + + +TEST_F(ParIlut, KernelThresholdFilterNoneUpperIsEquivalentToRef) +{ + test_filter(mtx_l, dmtx_l, 0, false); +} + + +TEST_F(ParIlut, KernelThresholdFilterAllLowerIsEquivalentToRef) +{ + test_filter(mtx_l, dmtx_l, 1e6, true); +} + + +TEST_F(ParIlut, KernelThresholdFilterAllUpperIsEquivalentToRef) +{ + test_filter(mtx_l, dmtx_l, 1e6, false); +} + + +TEST_F(ParIlut, KernelComplexThresholdFilterLowerIsEquivalentToRef) +{ + test_filter(mtx_l_complex, dmtx_l_complex, 0.5, true); +} + + +TEST_F(ParIlut, KernelComplexThresholdFilterNoneLowerIsEquivalentToRef) +{ + test_filter(mtx_l_complex, dmtx_l_complex, 0, true); +} + + +TEST_F(ParIlut, KernelComplexThresholdFilterAllLowerIsEquivalentToRef) +{ + test_filter(mtx_l_complex, dmtx_l_complex, 1e6, true); +} + + +#if defined(hipsparseVersionMajor) && defined(hipsparseVersionMinor) && \ + ((hipsparseVersionMajor > 1) || \ + (hipsparseVersionMajor == 1 && hipsparseVersionMinor >= 4)) +TEST_F(ParIlut, KernelComplexThresholdFilterUpperIsEquivalentToRef) +{ + test_filter(mtx_l_complex, dmtx_l_complex, 0.5, false); +} + + +TEST_F(ParIlut, KernelComplexThresholdFilterNoneUpperIsEquivalentToRef) +{ + test_filter(mtx_l_complex, dmtx_l_complex, 0, false); +} + + +TEST_F(ParIlut, KernelComplexThresholdFilterAllUppererIsEquivalentToRef) +{ + test_filter(mtx_l_complex, dmtx_l_complex, 1e6, false); +} +#endif // hipsparse version >= 1.4 + + +TEST_F(ParIlut, KernelThresholdFilterApproxNullptrCooIsEquivalentToRef) +{ + test_filter(mtx_l, dmtx_l, 0.5, true); + auto res = Csr::create(ref, mtx_size); + auto dres = Csr::create(hip, mtx_size); + Coo *null_coo = nullptr; + gko::Array tmp(ref); + gko::Array dtmp(hip); + gko::remove_complex threshold{}; + gko::remove_complex dthreshold{}; + index_type rank{}; + + 
gko::kernels::reference::par_ilut_factorization::threshold_filter_approx( + ref, mtx_l.get(), rank, tmp, threshold, res.get(), null_coo); + gko::kernels::hip::par_ilut_factorization::threshold_filter_approx( + hip, dmtx_l.get(), rank, dtmp, dthreshold, dres.get(), null_coo); + + GKO_ASSERT_MTX_NEAR(res, dres, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(res, dres); + ASSERT_EQ(threshold, dthreshold); +} + + +TEST_F(ParIlut, KernelThresholdFilterApproxLowerIsEquivalentToRef) +{ + test_filter_approx(mtx_l, dmtx_l, mtx_l->get_num_stored_elements() / 2); +} + + +TEST_F(ParIlut, KernelThresholdFilterApproxNoneLowerIsEquivalentToRef) +{ + test_filter_approx(mtx_l, dmtx_l, 0); +} + + +TEST_F(ParIlut, KernelThresholdFilterApproxAllLowerIsEquivalentToRef) +{ + test_filter_approx(mtx_l, dmtx_l, mtx_l->get_num_stored_elements() - 1); +} + + +TEST_F(ParIlut, KernelComplexThresholdFilterApproxLowerIsEquivalentToRef) +{ + test_filter_approx(mtx_l_complex, dmtx_l_complex, + mtx_l_complex->get_num_stored_elements() / 2, + r::value); +} + + +TEST_F(ParIlut, KernelComplexThresholdFilterApproxNoneLowerIsEquivalentToRef) +{ + test_filter_approx(mtx_l_complex, dmtx_l_complex, 0, r::value); +} + + +TEST_F(ParIlut, KernelComplexThresholdFilterApproxAllLowerIsEquivalentToRef) +{ + test_filter_approx(mtx_l_complex, dmtx_l_complex, + mtx_l_complex->get_num_stored_elements() - 1, + r::value); +} + + +TEST_F(ParIlut, KernelAddCandidatesIsEquivalentToRef) +{ + auto square_size = mtx_square->get_size(); + auto mtx_lu = Csr::create(ref, square_size); + mtx_l2->apply(mtx_u.get(), mtx_lu.get()); + auto dmtx_lu = Csr::create(hip, square_size); + dmtx_lu->copy_from(mtx_lu.get()); + auto res_mtx_l = Csr::create(ref, square_size); + auto res_mtx_u = Csr::create(ref, square_size); + auto dres_mtx_l = Csr::create(hip, square_size); + auto dres_mtx_u = Csr::create(hip, square_size); + + gko::kernels::reference::par_ilut_factorization::add_candidates( + ref, mtx_lu.get(), mtx_square.get(), mtx_l2.get(), mtx_u.get(), 
+ res_mtx_l.get(), res_mtx_u.get()); + gko::kernels::hip::par_ilut_factorization::add_candidates( + hip, dmtx_lu.get(), dmtx_square.get(), dmtx_l2.get(), dmtx_u.get(), + dres_mtx_l.get(), dres_mtx_u.get()); + + GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_l, dres_mtx_l); + GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_u, dres_mtx_u); + GKO_ASSERT_MTX_NEAR(res_mtx_l, dres_mtx_l, 1e-14); + GKO_ASSERT_MTX_NEAR(res_mtx_u, dres_mtx_u, 1e-14); +} + + +TEST_F(ParIlut, KernelComputeLUIsEquivalentToRef) +{ + auto square_size = mtx_ani->get_size(); + auto mtx_l_coo = Coo::create(ref, square_size); + auto mtx_u_coo = Coo::create(ref, square_size); + mtx_l_ani->convert_to(mtx_l_coo.get()); + mtx_u_ani->convert_to(mtx_u_coo.get()); + auto dmtx_l_coo = Coo::create(hip, square_size); + auto dmtx_u_coo = Coo::create(hip, square_size); + dmtx_l_coo->copy_from(mtx_l_coo.get()); + dmtx_u_coo->copy_from(mtx_u_coo.get()); + + gko::kernels::reference::par_ilut_factorization::compute_l_u_factors( + ref, mtx_ani.get(), mtx_l_ani.get(), mtx_l_coo.get(), mtx_u_ani.get(), + mtx_u_coo.get(), mtx_ut_ani.get()); + for (int i = 0; i < 20; ++i) { + gko::kernels::hip::par_ilut_factorization::compute_l_u_factors( + hip, dmtx_ani.get(), dmtx_l_ani.get(), dmtx_l_coo.get(), + dmtx_u_ani.get(), dmtx_u_coo.get(), dmtx_ut_ani.get()); + } + auto dmtx_utt_ani = gko::as(dmtx_ut_ani->transpose()); + + GKO_ASSERT_MTX_NEAR(mtx_l_ani, dmtx_l_ani, 1e-2); + GKO_ASSERT_MTX_NEAR(mtx_u_ani, dmtx_u_ani, 1e-2); + GKO_ASSERT_MTX_NEAR(dmtx_u_ani, dmtx_utt_ani, 0); +} + + +} // namespace diff --git a/hip/test/matrix/CMakeLists.txt b/hip/test/matrix/CMakeLists.txt new file mode 100644 index 00000000000..5f8b7251566 --- /dev/null +++ b/hip/test/matrix/CMakeLists.txt @@ -0,0 +1,6 @@ +ginkgo_create_hip_test(coo_kernels) +ginkgo_create_hip_test(csr_kernels) +ginkgo_create_hip_test(dense_kernels) +ginkgo_create_hip_test(ell_kernels) +ginkgo_create_hip_test(hybrid_kernels) +ginkgo_create_hip_test(sellp_kernels) diff --git 
a/hip/test/matrix/coo_kernels.hip.cpp b/hip/test/matrix/coo_kernels.hip.cpp new file mode 100644 index 00000000000..aa0f5373161 --- /dev/null +++ b/hip/test/matrix/coo_kernels.hip.cpp @@ -0,0 +1,262 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/matrix/coo_kernels.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class Coo : public ::testing::Test { +protected: + using Mtx = gko::matrix::Coo<>; + using Vec = gko::matrix::Dense<>; + + Coo() : rand_engine(42) {} + + void SetUp() + { + ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); + ref = gko::ReferenceExecutor::create(); + hip = gko::HipExecutor::create(0, ref); + } + + void TearDown() + { + if (hip != nullptr) { + ASSERT_NO_THROW(hip->synchronize()); + } + } + + std::unique_ptr gen_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, std::uniform_int_distribution<>(1, num_cols), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + } + + void set_up_apply_data(int num_vectors = 1) + { + mtx = Mtx::create(ref); + mtx->copy_from(gen_mtx(532, 231)); + expected = gen_mtx(532, num_vectors); + y = gen_mtx(231, num_vectors); + alpha = gko::initialize({2.0}, ref); + beta = gko::initialize({-1.0}, ref); + dmtx = Mtx::create(hip); + dmtx->copy_from(mtx.get()); + dresult = Vec::create(hip); + dresult->copy_from(expected.get()); + dy = Vec::create(hip); + dy->copy_from(y.get()); + dalpha = Vec::create(hip); + dalpha->copy_from(alpha.get()); + dbeta = Vec::create(hip); + dbeta->copy_from(beta.get()); + } + + + std::shared_ptr ref; + std::shared_ptr hip; + + std::ranlux48 rand_engine; + + std::unique_ptr mtx; + std::unique_ptr expected; + std::unique_ptr y; + std::unique_ptr alpha; + std::unique_ptr beta; + + std::unique_ptr dmtx; + std::unique_ptr dresult; + std::unique_ptr dy; + std::unique_ptr dalpha; + std::unique_ptr dbeta; +}; + + +TEST_F(Coo, SimpleApplyIsEquivalentToRef) +{ + set_up_apply_data(); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, 
expected, 1e-14); +} + + +TEST_F(Coo, AdvancedApplyIsEquivalentToRef) +{ + set_up_apply_data(); + + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Coo, SimpleApplyAddIsEquivalentToRef) +{ + set_up_apply_data(); + + mtx->apply2(y.get(), expected.get()); + dmtx->apply2(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Coo, AdvancedApplyAddIsEquivalentToRef) +{ + set_up_apply_data(); + + mtx->apply2(alpha.get(), y.get(), expected.get()); + dmtx->apply2(dalpha.get(), dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Coo, SimpleApplyToDenseMatrixIsEquivalentToRef) +{ + set_up_apply_data(3); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Coo, AdvancedApplyToDenseMatrixIsEquivalentToRef) +{ + set_up_apply_data(3); + + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Coo, SimpleApplyAddToDenseMatrixIsEquivalentToRef) +{ + set_up_apply_data(3); + + mtx->apply2(y.get(), expected.get()); + dmtx->apply2(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Coo, SimpleApplyAddToLargeDenseMatrixIsEquivalentToRef) +{ + set_up_apply_data(33); + + mtx->apply2(y.get(), expected.get()); + dmtx->apply2(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Coo, AdvancedApplyAddToDenseMatrixIsEquivalentToRef) +{ + set_up_apply_data(3); + + mtx->apply2(alpha.get(), y.get(), expected.get()); + dmtx->apply2(dalpha.get(), dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Coo, 
AdvancedApplyAddToLargeDenseMatrixIsEquivalentToRef) +{ + set_up_apply_data(33); + + mtx->apply2(alpha.get(), y.get(), expected.get()); // scaled overload: this is the "Advanced" variant + dmtx->apply2(dalpha.get(), dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Coo, ConvertToDenseIsEquivalentToRef) +{ + set_up_apply_data(); + auto dense_mtx = gko::matrix::Dense<>::create(ref); + auto ddense_mtx = gko::matrix::Dense<>::create(hip); + + mtx->convert_to(dense_mtx.get()); + dmtx->convert_to(ddense_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 1e-14); +} + + +TEST_F(Coo, ConvertToCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto dense_mtx = gko::matrix::Dense<>::create(ref); + auto csr_mtx = gko::matrix::Csr<>::create(ref); + auto dcsr_mtx = gko::matrix::Csr<>::create(hip); + + mtx->convert_to(dense_mtx.get()); + dense_mtx->convert_to(csr_mtx.get()); + dmtx->convert_to(dcsr_mtx.get()); + + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14); +} + + +} // namespace diff --git a/hip/test/matrix/csr_kernels.hip.cpp b/hip/test/matrix/csr_kernels.hip.cpp new file mode 100644 index 00000000000..a1b2adfd794 --- /dev/null +++ b/hip/test/matrix/csr_kernels.hip.cpp @@ -0,0 +1,701 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3.
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "core/matrix/csr_kernels.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class Csr : public ::testing::Test { +protected: + using Mtx = gko::matrix::Csr<>; + using Vec = gko::matrix::Dense<>; + + Csr() : mtx_size(532, 231), rand_engine(42) {} + + void SetUp() + { + ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); + ref = gko::ReferenceExecutor::create(); + hip = gko::HipExecutor::create(0, ref); + } + + void TearDown() + { + if (hip != nullptr) { + ASSERT_NO_THROW(hip->synchronize()); + } + } + + template + std::unique_ptr gen_mtx(int num_rows, int num_cols, + int min_nnz_row) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution<>(min_nnz_row, num_cols), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + } + + void 
set_up_apply_data(std::shared_ptr strategy, + int num_vectors = 1) + { + mtx = Mtx::create(ref, strategy); + mtx->copy_from(gen_mtx(mtx_size[0], mtx_size[1], 1)); + square_mtx = Mtx::create(ref, strategy); + square_mtx->copy_from(gen_mtx(mtx_size[0], mtx_size[0], 1)); + expected = gen_mtx(mtx_size[0], num_vectors, 1); + y = gen_mtx(mtx_size[1], num_vectors, 1); + alpha = gko::initialize({2.0}, ref); + beta = gko::initialize({-1.0}, ref); + dmtx = Mtx::create(hip, strategy); + dmtx->copy_from(mtx.get()); + square_dmtx = Mtx::create(hip, strategy); + square_dmtx->copy_from(square_mtx.get()); + dresult = Vec::create(hip); + dresult->copy_from(expected.get()); + dy = Vec::create(hip); + dy->copy_from(y.get()); + dalpha = Vec::create(hip); + dalpha->copy_from(alpha.get()); + dbeta = Vec::create(hip); + dbeta->copy_from(beta.get()); + } + + struct matrix_pair { + std::unique_ptr ref; + std::unique_ptr hip; + }; + + matrix_pair gen_unsorted_mtx() + { + constexpr int min_nnz_per_row = 2; // Must be at least 2; handed to gen_mtx as the per-row lower bound + auto local_mtx_ref = + gen_mtx(mtx_size[0], mtx_size[1], min_nnz_per_row); + for (size_t row = 0; row < mtx_size[0]; ++row) { + const auto row_ptrs = local_mtx_ref->get_const_row_ptrs(); + const auto start_row = row_ptrs[row]; + auto col_idx = local_mtx_ref->get_col_idxs() + start_row; + auto vals = local_mtx_ref->get_values() + start_row; + const auto nnz_in_this_row = row_ptrs[row + 1] - row_ptrs[row]; + auto swap_idx_dist = + std::uniform_int_distribution<>(0, nnz_in_this_row - 1); + // swap two random entries of this row (column index and value together) about `nnz_in_this_row / 2` times; idx1 may equal idx2 + for (size_t perm = 0; perm < nnz_in_this_row; perm += 2) { + const auto idx1 = swap_idx_dist(rand_engine); + const auto idx2 = swap_idx_dist(rand_engine); + std::swap(col_idx[idx1], col_idx[idx2]); + std::swap(vals[idx1], vals[idx2]); + } + } + auto local_mtx_hip = Mtx::create(hip); + local_mtx_hip->copy_from(local_mtx_ref.get()); // device copy of the shuffled reference matrix + + return {std::move(local_mtx_ref), std::move(local_mtx_hip)}; + } + + std::shared_ptr ref; + std::shared_ptr
hip; + + const gko::dim<2> mtx_size; + std::ranlux48 rand_engine; + + std::unique_ptr mtx; + std::unique_ptr square_mtx; + std::unique_ptr expected; + std::unique_ptr y; + std::unique_ptr alpha; + std::unique_ptr beta; + + std::unique_ptr dmtx; + std::unique_ptr square_dmtx; + std::unique_ptr dresult; + std::unique_ptr dy; + std::unique_ptr dalpha; + std::unique_ptr dbeta; +}; + + +TEST_F(Csr, StrategyAfterCopyIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared(hip)); + + ASSERT_EQ(mtx->get_strategy()->get_name(), + dmtx->get_strategy()->get_name()); +} + + +TEST_F(Csr, SimpleApplyIsEquivalentToRefWithLoadBalance) +{ + set_up_apply_data(std::make_shared(hip)); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Csr, AdvancedApplyIsEquivalentToRefWithLoadBalance) +{ + set_up_apply_data(std::make_shared(hip)); + + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Csr, SimpleApplyIsEquivalentToRefWithHipsparse) +{ + set_up_apply_data(std::make_shared()); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Csr, AdvancedApplyIsEquivalentToRefWithHipsparse) +{ + set_up_apply_data(std::make_shared()); + + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Csr, SimpleApplyIsEquivalentToRefWithMergePath) +{ + set_up_apply_data(std::make_shared()); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Csr, AdvancedApplyIsEquivalentToRefWithMergePath) +{ + set_up_apply_data(std::make_shared()); + + 
mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Csr, SimpleApplyIsEquivalentToRefWithClassical) +{ + set_up_apply_data(std::make_shared()); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Csr, AdvancedApplyIsEquivalentToRefWithClassical) +{ + set_up_apply_data(std::make_shared()); + + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Csr, SimpleApplyIsEquivalentToRefWithAutomatical) +{ + set_up_apply_data(std::make_shared(hip)); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Csr, SimpleApplyToDenseMatrixIsEquivalentToRefWithLoadBalance) +{ + set_up_apply_data(std::make_shared(hip), 3); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Csr, AdvancedApplyToDenseMatrixIsEquivalentToRefWithLoadBalance) +{ + set_up_apply_data(std::make_shared(hip), 3); + + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Csr, SimpleApplyToDenseMatrixIsEquivalentToRefWithClassical) +{ + set_up_apply_data(std::make_shared(), 3); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Csr, AdvancedApplyToDenseMatrixIsEquivalentToRefWithClassical) +{ + set_up_apply_data(std::make_shared(), 3); + + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + 
dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Csr, SimpleApplyToDenseMatrixIsEquivalentToRefWithMergePath) +{ + set_up_apply_data(std::make_shared(), 3); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Csr, AdvancedApplyToDenseMatrixIsEquivalentToRefWithMergePath) +{ + set_up_apply_data(std::make_shared(), 3); + + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Csr, AdvancedApplyToCsrMatrixIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared(hip)); + auto trans = mtx->transpose(); + auto d_trans = dmtx->transpose(); + + mtx->apply(alpha.get(), trans.get(), beta.get(), square_mtx.get()); + dmtx->apply(dalpha.get(), d_trans.get(), dbeta.get(), square_dmtx.get()); + + GKO_ASSERT_MTX_NEAR(square_dmtx, square_mtx, 1e-14); + GKO_ASSERT_MTX_EQ_SPARSITY(square_dmtx, square_mtx); + ASSERT_TRUE(square_dmtx->is_sorted_by_column_index()); +} + + +TEST_F(Csr, SimpleApplyToCsrMatrixIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared(hip)); + auto trans = mtx->transpose(); + auto d_trans = dmtx->transpose(); + + mtx->apply(trans.get(), square_mtx.get()); + dmtx->apply(d_trans.get(), square_dmtx.get()); + + GKO_ASSERT_MTX_NEAR(square_dmtx, square_mtx, 1e-14); + GKO_ASSERT_MTX_EQ_SPARSITY(square_dmtx, square_mtx); + ASSERT_TRUE(square_dmtx->is_sorted_by_column_index()); +} + + +TEST_F(Csr, AdvancedApplyToIdentityMatrixIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared(hip)); + auto a = gen_mtx(mtx_size[0], mtx_size[1], 0); + auto b = gen_mtx(mtx_size[0], mtx_size[1], 0); + auto da = Mtx::create(hip); + auto db = Mtx::create(hip); + da->copy_from(a.get()); + db->copy_from(b.get()); + auto id = gko::matrix::Identity::create(ref, 
mtx_size[1]); + auto did = gko::matrix::Identity::create(hip, mtx_size[1]); + + a->apply(alpha.get(), id.get(), beta.get(), b.get()); + da->apply(dalpha.get(), did.get(), dbeta.get(), db.get()); + + GKO_ASSERT_MTX_NEAR(b, db, 1e-14); + GKO_ASSERT_MTX_EQ_SPARSITY(b, db); + ASSERT_TRUE(db->is_sorted_by_column_index()); +} + + +TEST_F(Csr, TransposeIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared(hip)); + + auto trans = mtx->transpose(); + auto d_trans = dmtx->transpose(); + + GKO_ASSERT_MTX_NEAR(static_cast(d_trans.get()), + static_cast(trans.get()), 0.0); +} + + +TEST_F(Csr, ConvertToDenseIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared()); + auto dense_mtx = gko::matrix::Dense<>::create(ref); + auto ddense_mtx = gko::matrix::Dense<>::create(hip); + + mtx->convert_to(dense_mtx.get()); + dmtx->convert_to(ddense_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 1e-14); +} + + +TEST_F(Csr, MoveToDenseIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared()); + auto dense_mtx = gko::matrix::Dense<>::create(ref); + auto ddense_mtx = gko::matrix::Dense<>::create(hip); + + mtx->move_to(dense_mtx.get()); + dmtx->move_to(ddense_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 1e-14); +} + + +TEST_F(Csr, ConvertToEllIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared()); + auto ell_mtx = gko::matrix::Ell<>::create(ref); + auto dell_mtx = gko::matrix::Ell<>::create(hip); + + mtx->convert_to(ell_mtx.get()); + dmtx->convert_to(dell_mtx.get()); + + GKO_ASSERT_MTX_NEAR(ell_mtx.get(), dell_mtx.get(), 1e-14); +} + + +TEST_F(Csr, MoveToEllIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared()); + auto ell_mtx = gko::matrix::Ell<>::create(ref); + auto dell_mtx = gko::matrix::Ell<>::create(hip); + + mtx->move_to(ell_mtx.get()); + dmtx->move_to(dell_mtx.get()); + + GKO_ASSERT_MTX_NEAR(ell_mtx.get(), dell_mtx.get(), 1e-14); +} + + +TEST_F(Csr, ConvertToSparsityCsrIsEquivalentToRef) +{ + 
set_up_apply_data(std::make_shared()); + auto sparsity_mtx = gko::matrix::SparsityCsr<>::create(ref); + auto d_sparsity_mtx = gko::matrix::SparsityCsr<>::create(hip); + + mtx->convert_to(sparsity_mtx.get()); + dmtx->convert_to(d_sparsity_mtx.get()); + + GKO_ASSERT_MTX_NEAR(sparsity_mtx.get(), d_sparsity_mtx.get(), 1e-14); +} + + +TEST_F(Csr, MoveToSparsityCsrIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared()); + auto sparsity_mtx = gko::matrix::SparsityCsr<>::create(ref); + auto d_sparsity_mtx = gko::matrix::SparsityCsr<>::create(hip); + + mtx->move_to(sparsity_mtx.get()); + dmtx->move_to(d_sparsity_mtx.get()); + + GKO_ASSERT_MTX_NEAR(sparsity_mtx.get(), d_sparsity_mtx.get(), 1e-14); +} + + +TEST_F(Csr, ConvertsEmptyToSellp) +{ + auto dempty_mtx = Mtx::create(hip); + auto dsellp_mtx = gko::matrix::Sellp<>::create(hip); + + dempty_mtx->convert_to(dsellp_mtx.get()); + + ASSERT_EQ(hip->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), 0); + ASSERT_FALSE(dsellp_mtx->get_size()); +} + + +TEST_F(Csr, CalculateMaxNnzPerRowIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared()); + gko::size_type max_nnz_per_row; + gko::size_type dmax_nnz_per_row; + + gko::kernels::reference::csr::calculate_max_nnz_per_row(ref, mtx.get(), + &max_nnz_per_row); + gko::kernels::hip::csr::calculate_max_nnz_per_row(hip, dmtx.get(), + &dmax_nnz_per_row); + + ASSERT_EQ(max_nnz_per_row, dmax_nnz_per_row); +} + + +TEST_F(Csr, ConvertToCooIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared()); + auto coo_mtx = gko::matrix::Coo<>::create(ref); + auto dcoo_mtx = gko::matrix::Coo<>::create(hip); + + mtx->convert_to(coo_mtx.get()); + dmtx->convert_to(dcoo_mtx.get()); + + GKO_ASSERT_MTX_NEAR(coo_mtx.get(), dcoo_mtx.get(), 1e-14); +} + + +TEST_F(Csr, MoveToCooIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared()); + auto coo_mtx = gko::matrix::Coo<>::create(ref); + auto dcoo_mtx = gko::matrix::Coo<>::create(hip); + + mtx->move_to(coo_mtx.get()); + 
dmtx->move_to(dcoo_mtx.get()); + + GKO_ASSERT_MTX_NEAR(coo_mtx.get(), dcoo_mtx.get(), 1e-14); +} + + +TEST_F(Csr, ConvertToSellpIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared()); + auto sellp_mtx = gko::matrix::Sellp<>::create(ref); + auto dsellp_mtx = gko::matrix::Sellp<>::create(hip); + + mtx->convert_to(sellp_mtx.get()); + dmtx->convert_to(dsellp_mtx.get()); + + GKO_ASSERT_MTX_NEAR(sellp_mtx.get(), dsellp_mtx.get(), 1e-14); +} + + +TEST_F(Csr, MoveToSellpIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared()); + auto sellp_mtx = gko::matrix::Sellp<>::create(ref); + auto dsellp_mtx = gko::matrix::Sellp<>::create(hip); + + mtx->move_to(sellp_mtx.get()); + dmtx->move_to(dsellp_mtx.get()); + + GKO_ASSERT_MTX_NEAR(sellp_mtx.get(), dsellp_mtx.get(), 1e-14); +} + + +TEST_F(Csr, CalculateTotalColsIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared()); + gko::size_type total_cols; + gko::size_type dtotal_cols; + + gko::kernels::reference::csr::calculate_total_cols( + ref, mtx.get(), &total_cols, 2, gko::matrix::default_slice_size); + gko::kernels::hip::csr::calculate_total_cols( + hip, dmtx.get(), &dtotal_cols, 2, gko::matrix::default_slice_size); + + ASSERT_EQ(total_cols, dtotal_cols); +} + + +TEST_F(Csr, CalculatesNonzerosPerRow) +{ + set_up_apply_data(std::make_shared()); + gko::Array row_nnz(ref, mtx->get_size()[0]); + gko::Array drow_nnz(hip, dmtx->get_size()[0]); + + gko::kernels::reference::csr::calculate_nonzeros_per_row(ref, mtx.get(), + &row_nnz); + gko::kernels::hip::csr::calculate_nonzeros_per_row(hip, dmtx.get(), + &drow_nnz); + + GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz); +} + + +TEST_F(Csr, ConvertToHybridIsEquivalentToRef) +{ + using Hybrid_type = gko::matrix::Hybrid<>; + set_up_apply_data(std::make_shared()); + auto hybrid_mtx = Hybrid_type::create( + ref, std::make_shared(2)); + auto dhybrid_mtx = Hybrid_type::create( + hip, std::make_shared(2)); + + mtx->convert_to(hybrid_mtx.get()); + dmtx->convert_to(dhybrid_mtx.get()); + + 
GKO_ASSERT_MTX_NEAR(hybrid_mtx.get(), dhybrid_mtx.get(), 1e-14); +} + + +TEST_F(Csr, MoveToHybridIsEquivalentToRef) +{ + using Hybrid_type = gko::matrix::Hybrid<>; + set_up_apply_data(std::make_shared()); + auto hybrid_mtx = Hybrid_type::create( + ref, std::make_shared(2)); + auto dhybrid_mtx = Hybrid_type::create( + hip, std::make_shared(2)); + + mtx->move_to(hybrid_mtx.get()); + dmtx->move_to(dhybrid_mtx.get()); + + GKO_ASSERT_MTX_NEAR(hybrid_mtx.get(), dhybrid_mtx.get(), 1e-14); +} + + +TEST_F(Csr, RecognizeSortedMatrixIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared()); + bool is_sorted_hip{}; + bool is_sorted_ref{}; + + is_sorted_ref = mtx->is_sorted_by_column_index(); + is_sorted_hip = dmtx->is_sorted_by_column_index(); + + ASSERT_EQ(is_sorted_ref, is_sorted_hip); +} + + +TEST_F(Csr, RecognizeUnsortedMatrixIsEquivalentToRef) +{ + auto uns_mtx = gen_unsorted_mtx(); + bool is_sorted_hip{}; + bool is_sorted_ref{}; + + is_sorted_ref = uns_mtx.ref->is_sorted_by_column_index(); + is_sorted_hip = uns_mtx.hip->is_sorted_by_column_index(); + + ASSERT_EQ(is_sorted_ref, is_sorted_hip); +} + + +TEST_F(Csr, SortSortedMatrixIsEquivalentToRef) +{ + set_up_apply_data(std::make_shared(hip)); + + mtx->sort_by_column_index(); + dmtx->sort_by_column_index(); + + // Values must be unchanged, therefore, tolerance is `0` + GKO_ASSERT_MTX_NEAR(mtx, dmtx, 0); +} + + +TEST_F(Csr, SortUnsortedMatrixIsEquivalentToRef) +{ + auto uns_mtx = gen_unsorted_mtx(); + + uns_mtx.ref->sort_by_column_index(); + uns_mtx.hip->sort_by_column_index(); + + // Values must be unchanged, therefore, tolerance is `0` + GKO_ASSERT_MTX_NEAR(uns_mtx.ref, uns_mtx.hip, 0); +} + + +TEST_F(Csr, OneAutomaticalWorksWithDifferentMatrices) +{ + auto automatical = std::make_shared(hip); + auto row_len_limit = std::max(automatical->nvidia_row_len_limit, + automatical->amd_row_len_limit); + auto load_balance_mtx = Mtx::create(ref); + auto classical_mtx = Mtx::create(ref); + load_balance_mtx->copy_from( + 
gen_mtx(1, row_len_limit + 1000, row_len_limit + 1)); + classical_mtx->copy_from(gen_mtx(50, 50, 1)); + auto load_balance_mtx_d = Mtx::create(hip); + auto classical_mtx_d = Mtx::create(hip); + load_balance_mtx_d->copy_from(load_balance_mtx.get()); + classical_mtx_d->copy_from(classical_mtx.get()); + + load_balance_mtx_d->set_strategy(automatical); + classical_mtx_d->set_strategy(automatical); + + EXPECT_EQ("load_balance", load_balance_mtx_d->get_strategy()->get_name()); + EXPECT_EQ("classical", classical_mtx_d->get_strategy()->get_name()); + ASSERT_NE(load_balance_mtx_d->get_strategy().get(), + classical_mtx_d->get_strategy().get()); +} + + +} // namespace diff --git a/hip/test/matrix/dense_kernels.hip.cpp b/hip/test/matrix/dense_kernels.hip.cpp new file mode 100644 index 00000000000..96261c4ab46 --- /dev/null +++ b/hip/test/matrix/dense_kernels.hip.cpp @@ -0,0 +1,536 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include +#include +#include + + +#include "core/matrix/dense_kernels.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class Dense : public ::testing::Test { +protected: + using itype = int; + using vtype = double; + using Mtx = gko::matrix::Dense; + using NormVector = gko::matrix::Dense>; + using Arr = gko::Array; + + Dense() : rand_engine(15) {} + + void SetUp() + { + ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); + ref = gko::ReferenceExecutor::create(); + hip = gko::HipExecutor::create(0, ref); + } + + void TearDown() + { + if (hip != nullptr) { + ASSERT_NO_THROW(hip->synchronize()); + } + } + + template + std::unique_ptr gen_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(0.0, 1.0), rand_engine, ref); + } + + void set_up_vector_data(gko::size_type num_vecs, + bool different_alpha = false) + { + x = gen_mtx(1000, num_vecs); + y = gen_mtx(1000, num_vecs); + if (different_alpha) { + alpha = gen_mtx(1, num_vecs); + } else { + alpha = gko::initialize({2.0}, ref); + } + dx = Mtx::create(hip); + dx->copy_from(x.get()); + dy = Mtx::create(hip); + dy->copy_from(y.get()); + dalpha = Mtx::create(hip); + dalpha->copy_from(alpha.get()); + expected = Mtx::create(ref, 
gko::dim<2>{1, num_vecs}); + dresult = Mtx::create(hip, gko::dim<2>{1, num_vecs}); + } + + void set_up_apply_data() + { + x = gen_mtx(65, 25); + y = gen_mtx(25, 35); + expected = gen_mtx(65, 35); + alpha = gko::initialize({2.0}, ref); + beta = gko::initialize({-1.0}, ref); + dx = Mtx::create(hip); + dx->copy_from(x.get()); + dy = Mtx::create(hip); + dy->copy_from(y.get()); + dresult = Mtx::create(hip); + dresult->copy_from(expected.get()); + dalpha = Mtx::create(hip); + dalpha->copy_from(alpha.get()); + dbeta = Mtx::create(hip); + dbeta->copy_from(beta.get()); + + std::vector tmp(x->get_size()[0], 0); + auto rng = std::default_random_engine{}; + std::iota(tmp.begin(), tmp.end(), 0); + std::shuffle(tmp.begin(), tmp.end(), rng); + std::vector tmp2(x->get_size()[1], 0); + std::iota(tmp2.begin(), tmp2.end(), 0); + std::shuffle(tmp2.begin(), tmp2.end(), rng); + rpermute_idxs = + std::unique_ptr(new Arr{ref, tmp.begin(), tmp.end()}); + drpermute_idxs = + std::unique_ptr(new Arr{hip, tmp.begin(), tmp.end()}); + cpermute_idxs = + std::unique_ptr(new Arr{ref, tmp2.begin(), tmp2.end()}); + dcpermute_idxs = + std::unique_ptr(new Arr{hip, tmp2.begin(), tmp2.end()}); + } + + std::shared_ptr ref; + std::shared_ptr hip; + + std::ranlux48 rand_engine; + + std::unique_ptr x; + std::unique_ptr y; + std::unique_ptr alpha; + std::unique_ptr beta; + std::unique_ptr expected; + std::unique_ptr dresult; + std::unique_ptr dx; + std::unique_ptr dy; + std::unique_ptr dalpha; + std::unique_ptr dbeta; + std::unique_ptr rpermute_idxs; + std::unique_ptr drpermute_idxs; + std::unique_ptr cpermute_idxs; + std::unique_ptr dcpermute_idxs; +}; + + +TEST_F(Dense, SingleVectorHipScaleIsEquivalentToRef) +{ + set_up_vector_data(1); + auto result = Mtx::create(ref); + + x->scale(alpha.get()); + dx->scale(dalpha.get()); + result->copy_from(dx.get()); + + GKO_ASSERT_MTX_NEAR(result, x, 1e-14); +} + + +TEST_F(Dense, MultipleVectorHipScaleIsEquivalentToRef) +{ + set_up_vector_data(20); + + 
x->scale(alpha.get()); + dx->scale(dalpha.get()); + + GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(Dense, MultipleVectorHipScaleWithDifferentAlphaIsEquivalentToRef) +{ + set_up_vector_data(20, true); + + x->scale(alpha.get()); + dx->scale(dalpha.get()); + + GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(Dense, SingleVectorHipAddScaledIsEquivalentToRef) +{ + set_up_vector_data(1); + + x->add_scaled(alpha.get(), y.get()); + dx->add_scaled(dalpha.get(), dy.get()); + + GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(Dense, MultipleVectorHipAddScaledIsEquivalentToRef) +{ + set_up_vector_data(20); + + x->add_scaled(alpha.get(), y.get()); + dx->add_scaled(dalpha.get(), dy.get()); + + GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(Dense, MultipleVectorHipAddScaledWithDifferentAlphaIsEquivalentToRef) +{ + set_up_vector_data(20, true); // different_alpha: alpha becomes a 1 x num_vecs row instead of one scalar + + x->add_scaled(alpha.get(), y.get()); + dx->add_scaled(dalpha.get(), dy.get()); + + GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(Dense, SingleVectorHipComputeDotIsEquivalentToRef) +{ + set_up_vector_data(1); + + x->compute_dot(y.get(), expected.get()); + dx->compute_dot(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Dense, MultipleVectorHipComputeDotIsEquivalentToRef) +{ + set_up_vector_data(20); + + x->compute_dot(y.get(), expected.get()); + dx->compute_dot(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Dense, HipComputeNorm2IsEquivalentToRef) +{ + set_up_vector_data(20); + auto norm_size = gko::dim<2>{1, x->get_size()[1]}; + auto norm_expected = NormVector::create(this->ref, norm_size); + auto dnorm = NormVector::create(this->hip, norm_size); + + x->compute_norm2(norm_expected.get()); + dx->compute_norm2(dnorm.get()); + + GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, 1e-14); +} + + +TEST_F(Dense, SimpleApplyIsEquivalentToRef) +{ + set_up_apply_data(); + + x->apply(y.get(), expected.get()); + dx->apply(dy.get(), dresult.get()); + +
GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Dense, AdvancedApplyIsEquivalentToRef) +{ + set_up_apply_data(); + + x->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Dense, IsTransposable) +{ + set_up_apply_data(); + + auto trans = x->transpose(); + auto dtrans = dx->transpose(); + + GKO_ASSERT_MTX_NEAR(static_cast(dtrans.get()), + static_cast(trans.get()), 0); +} + + +TEST_F(Dense, ConvertToCooIsEquivalentToRef) +{ + set_up_apply_data(); + auto coo_mtx = gko::matrix::Coo<>::create(ref); + auto dcoo_mtx = gko::matrix::Coo<>::create(hip); + + x->convert_to(coo_mtx.get()); + dx->convert_to(dcoo_mtx.get()); + + ASSERT_EQ(dcoo_mtx->get_num_stored_elements(), + coo_mtx->get_num_stored_elements()); + GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), 1e-14); +} + + +TEST_F(Dense, MoveToCooIsEquivalentToRef) +{ + set_up_apply_data(); + auto coo_mtx = gko::matrix::Coo<>::create(ref); + auto dcoo_mtx = gko::matrix::Coo<>::create(hip); + + x->move_to(coo_mtx.get()); + dx->move_to(dcoo_mtx.get()); + + ASSERT_EQ(dcoo_mtx->get_num_stored_elements(), + coo_mtx->get_num_stored_elements()); + GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), 1e-14); +} + + +TEST_F(Dense, ConvertToCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto csr_mtx = gko::matrix::Csr<>::create(ref); + auto dcsr_mtx = gko::matrix::Csr<>::create(hip); + + x->convert_to(csr_mtx.get()); + dx->convert_to(dcsr_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), 1e-14); +} + + +TEST_F(Dense, MoveToCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto csr_mtx = gko::matrix::Csr<>::create(ref); + auto dcsr_mtx = gko::matrix::Csr<>::create(hip); + + x->move_to(csr_mtx.get()); + dx->move_to(dcsr_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), 1e-14); +} + + +TEST_F(Dense, ConvertToEllIsEquivalentToRef) +{ + 
set_up_apply_data(); + auto ell_mtx = gko::matrix::Ell<>::create(ref); + auto dell_mtx = gko::matrix::Ell<>::create(hip); + + x->convert_to(ell_mtx.get()); + dx->convert_to(dell_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), 1e-14); +} + + +TEST_F(Dense, MoveToEllIsEquivalentToRef) +{ + set_up_apply_data(); + auto ell_mtx = gko::matrix::Ell<>::create(ref); + auto dell_mtx = gko::matrix::Ell<>::create(hip); + + x->move_to(ell_mtx.get()); + dx->move_to(dell_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), 1e-14); +} + + +TEST_F(Dense, ConvertToSellpIsEquivalentToRef) +{ + set_up_apply_data(); + auto sellp_mtx = gko::matrix::Sellp<>::create(ref); + auto dsellp_mtx = gko::matrix::Sellp<>::create(hip); + + x->convert_to(sellp_mtx.get()); + dx->convert_to(dsellp_mtx.get()); + + GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-14); +} + + +TEST_F(Dense, MoveToSellpIsEquivalentToRef) +{ + set_up_apply_data(); + auto sellp_mtx = gko::matrix::Sellp<>::create(ref); + auto dsellp_mtx = gko::matrix::Sellp<>::create(hip); + + x->move_to(sellp_mtx.get()); + dx->move_to(dsellp_mtx.get()); + + GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-14); +} + + +TEST_F(Dense, ConvertsEmptyToSellp) +{ + auto dempty_mtx = Mtx::create(hip); + auto dsellp_mtx = gko::matrix::Sellp<>::create(hip); + + dempty_mtx->convert_to(dsellp_mtx.get()); + + ASSERT_EQ(hip->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), 0); + ASSERT_FALSE(dsellp_mtx->get_size()); +} + + +TEST_F(Dense, CountNNZIsEquivalentToRef) +{ + set_up_apply_data(); + gko::size_type nnz; + gko::size_type dnnz; + + gko::kernels::reference::dense::count_nonzeros(ref, x.get(), &nnz); + gko::kernels::hip::dense::count_nonzeros(hip, dx.get(), &dnnz); + + ASSERT_EQ(nnz, dnnz); +} + + +TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) +{ + set_up_apply_data(); + gko::Array nnz_per_row(ref); + nnz_per_row.resize_and_reset(x->get_size()[0]); + gko::Array dnnz_per_row(hip); + 
dnnz_per_row.resize_and_reset(dx->get_size()[0]); + + gko::kernels::reference::dense::calculate_nonzeros_per_row(ref, x.get(), + &nnz_per_row); + gko::kernels::hip::dense::calculate_nonzeros_per_row(hip, dx.get(), + &dnnz_per_row); + + auto tmp = gko::Array(ref, dnnz_per_row); + for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { + ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); + } +} + + +TEST_F(Dense, CalculateMaxNNZPerRowIsEquivalentToRef) +{ + set_up_apply_data(); + gko::size_type max_nnz; + gko::size_type dmax_nnz; + + gko::kernels::reference::dense::calculate_max_nnz_per_row(ref, x.get(), + &max_nnz); + gko::kernels::hip::dense::calculate_max_nnz_per_row(hip, dx.get(), + &dmax_nnz); + + ASSERT_EQ(max_nnz, dmax_nnz); +} + + +TEST_F(Dense, CalculateTotalColsIsEquivalentToRef) +{ + set_up_apply_data(); + gko::size_type total_cols; + gko::size_type dtotal_cols; + + gko::kernels::reference::dense::calculate_total_cols( + ref, x.get(), &total_cols, 2, gko::matrix::default_slice_size); + gko::kernels::hip::dense::calculate_total_cols( + hip, dx.get(), &dtotal_cols, 2, gko::matrix::default_slice_size); + + ASSERT_EQ(total_cols, dtotal_cols); +} + + +TEST_F(Dense, IsRowPermutable) +{ + set_up_apply_data(); + + auto r_permute = x->row_permute(rpermute_idxs.get()); + auto dr_permute = dx->row_permute(drpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(r_permute.get()), + static_cast(dr_permute.get()), 0); +} + + +TEST_F(Dense, IsColPermutable) +{ + set_up_apply_data(); + + auto c_permute = x->column_permute(cpermute_idxs.get()); + auto dc_permute = dx->column_permute(dcpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(c_permute.get()), + static_cast(dc_permute.get()), 0); +} + + +TEST_F(Dense, IsInverseRowPermutable) +{ + set_up_apply_data(); + + auto inverse_r_permute = x->inverse_row_permute(rpermute_idxs.get()); + auto d_inverse_r_permute = dx->inverse_row_permute(drpermute_idxs.get()); + + 
GKO_ASSERT_MTX_NEAR(static_cast(inverse_r_permute.get()), + static_cast(d_inverse_r_permute.get()), 0); +} + + +TEST_F(Dense, IsInverseColPermutable) +{ + set_up_apply_data(); + + auto inverse_c_permute = x->inverse_column_permute(cpermute_idxs.get()); + auto d_inverse_c_permute = dx->inverse_column_permute(dcpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(inverse_c_permute.get()), + static_cast(d_inverse_c_permute.get()), 0); +} + + +} // namespace diff --git a/hip/test/matrix/ell_kernels.hip.cpp b/hip/test/matrix/ell_kernels.hip.cpp new file mode 100644 index 00000000000..c28285ae885 --- /dev/null +++ b/hip/test/matrix/ell_kernels.hip.cpp @@ -0,0 +1,347 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/matrix/ell_kernels.hpp" +#include "core/test/utils.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class Ell : public ::testing::Test { +protected: + using Mtx = gko::matrix::Ell<>; + using Vec = gko::matrix::Dense<>; + + Ell() : rand_engine(42) {} + + void SetUp() + { + ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); + ref = gko::ReferenceExecutor::create(); + hip = gko::HipExecutor::create(0, ref); + } + + void TearDown() + { + if (hip != nullptr) { + ASSERT_NO_THROW(hip->synchronize()); + } + } + + std::unique_ptr gen_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, std::uniform_int_distribution<>(1, num_cols), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + } + + void set_up_apply_data(int num_rows = 532, int num_cols = 231, + int num_vectors = 1, + int num_stored_elements_per_row = 0, int stride = 0) + { + mtx = Mtx::create(ref, gko::dim<2>{}, num_stored_elements_per_row, + stride); + mtx->copy_from(gen_mtx(num_rows, num_cols)); + expected = gen_mtx(num_rows, num_vectors); + y = gen_mtx(num_cols, num_vectors); + alpha = gko::initialize({2.0}, ref); + beta = gko::initialize({-1.0}, ref); + dmtx = Mtx::create(hip); + dmtx->copy_from(mtx.get()); + dresult = Vec::create(hip); + 
dresult->copy_from(expected.get()); + dy = Vec::create(hip); + dy->copy_from(y.get()); + dalpha = Vec::create(hip); + dalpha->copy_from(alpha.get()); + dbeta = Vec::create(hip); + dbeta->copy_from(beta.get()); + } + + + std::shared_ptr ref; + std::shared_ptr hip; + + std::ranlux48 rand_engine; + + std::unique_ptr mtx; + std::unique_ptr expected; + std::unique_ptr y; + std::unique_ptr alpha; + std::unique_ptr beta; + + std::unique_ptr dmtx; + std::unique_ptr dresult; + std::unique_ptr dy; + std::unique_ptr dalpha; + std::unique_ptr dbeta; +}; + + +TEST_F(Ell, SimpleApplyIsEquivalentToRef) +{ + set_up_apply_data(); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, AdvancedApplyIsEquivalentToRef) +{ + set_up_apply_data(); + + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef) +{ + set_up_apply_data(532, 231, 1, 300, 600); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef) +{ + set_up_apply_data(532, 231, 1, 300, 600); + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef) +{ + set_up_apply_data(532, 231, 3, 300, 600); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef) +{ + set_up_apply_data(532, 231, 3, 300, 600); + + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + 
dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, SimpleApplyByAtomicIsEquivalentToRef) +{ + set_up_apply_data(10, 10000); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, AdvancedByAtomicApplyIsEquivalentToRef) +{ + set_up_apply_data(10, 10000); + + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, SimpleApplyByAtomicToDenseMatrixIsEquivalentToRef) +{ + set_up_apply_data(10, 10000, 3); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, AdvancedByAtomicToDenseMatrixApplyIsEquivalentToRef) +{ + set_up_apply_data(10, 10000, 3); + + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, SimpleApplyOnSmallMatrixIsEquivalentToRef) +{ + set_up_apply_data(1, 10); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, AdvancedApplyOnSmallMatrixToDenseMatrixIsEquivalentToRef) +{ + set_up_apply_data(1, 10, 3); + + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, SimpleApplyOnSmallMatrixToDenseMatrixIsEquivalentToRef) +{ + set_up_apply_data(1, 10, 3); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, AdvancedApplyOnSmallMatrixIsEquivalentToRef) +{ + 
set_up_apply_data(1, 10); + + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, ConvertToDenseIsEquivalentToRef) +{ + set_up_apply_data(); + + auto dense_mtx = gko::matrix::Dense<>::create(ref); + auto ddense_mtx = gko::matrix::Dense<>::create(hip); + + mtx->convert_to(dense_mtx.get()); + dmtx->convert_to(ddense_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 1e-14); +} + + +TEST_F(Ell, ConvertToCsrIsEquivalentToRef) +{ + set_up_apply_data(); + + auto csr_mtx = gko::matrix::Csr<>::create(ref); + auto dcsr_mtx = gko::matrix::Csr<>::create(hip); + + mtx->convert_to(csr_mtx.get()); + dmtx->convert_to(dcsr_mtx.get()); + + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14); +} + + +TEST_F(Ell, CalculateNNZPerRowIsEquivalentToRef) +{ + set_up_apply_data(); + + gko::Array nnz_per_row; + nnz_per_row.set_executor(ref); + nnz_per_row.resize_and_reset(mtx->get_size()[0]); + + gko::Array dnnz_per_row; + dnnz_per_row.set_executor(hip); + dnnz_per_row.resize_and_reset(dmtx->get_size()[0]); + + gko::kernels::reference::ell::calculate_nonzeros_per_row(ref, mtx.get(), + &nnz_per_row); + gko::kernels::hip::ell::calculate_nonzeros_per_row(hip, dmtx.get(), + &dnnz_per_row); + + auto tmp = gko::Array(ref, dnnz_per_row); + for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { + ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); + } +} + + +TEST_F(Ell, CountNNZIsEquivalentToRef) +{ + set_up_apply_data(); + + gko::size_type nnz; + gko::size_type dnnz; + + gko::kernels::reference::ell::count_nonzeros(ref, mtx.get(), &nnz); + gko::kernels::hip::ell::count_nonzeros(hip, dmtx.get(), &dnnz); + + ASSERT_EQ(nnz, dnnz); +} + + +} // namespace diff --git a/hip/test/matrix/hybrid_kernels.hip.cpp b/hip/test/matrix/hybrid_kernels.hip.cpp new file mode 100644 index 00000000000..83d2cc37c86 --- /dev/null 
+++ b/hip/test/matrix/hybrid_kernels.hip.cpp @@ -0,0 +1,222 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include + + +#include "core/matrix/hybrid_kernels.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class Hybrid : public ::testing::Test { +protected: + using Mtx = gko::matrix::Hybrid<>; + using Vec = gko::matrix::Dense<>; + + Hybrid() : rand_engine(42) {} + + void SetUp() + { + ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); + ref = gko::ReferenceExecutor::create(); + hip = gko::HipExecutor::create(0, ref); + } + + void TearDown() + { + if (hip != nullptr) { + ASSERT_NO_THROW(hip->synchronize()); + } + } + + std::unique_ptr gen_mtx(int num_rows, int num_cols, int min_nnz_row) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution<>(min_nnz_row, num_cols), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + } + + void set_up_apply_data(int num_vectors = 1, + std::shared_ptr strategy = + std::make_shared()) + { + mtx = Mtx::create(ref, strategy); + mtx->copy_from(gen_mtx(532, 231, 1)); + expected = gen_mtx(532, num_vectors, 1); + y = gen_mtx(231, num_vectors, 1); + alpha = gko::initialize({2.0}, ref); + beta = gko::initialize({-1.0}, ref); + dmtx = Mtx::create(hip, strategy); + dmtx->copy_from(mtx.get()); + dresult = Vec::create(hip); + dresult->copy_from(expected.get()); + dy = Vec::create(hip); + dy->copy_from(y.get()); + dalpha = Vec::create(hip); + dalpha->copy_from(alpha.get()); + dbeta = Vec::create(hip); + dbeta->copy_from(beta.get()); + } + + + std::shared_ptr ref; + std::shared_ptr hip; + + std::ranlux48 rand_engine; + + std::unique_ptr mtx; + std::unique_ptr expected; + std::unique_ptr y; + std::unique_ptr alpha; + std::unique_ptr beta; + + std::unique_ptr dmtx; + std::unique_ptr dresult; + std::unique_ptr dy; + std::unique_ptr dalpha; + std::unique_ptr dbeta; +}; + + +TEST_F(Hybrid, SubMatrixExecutorAfterCopyIsEquivalentToExcutor) +{ + 
set_up_apply_data(); + + auto coo_mtx = dmtx->get_coo(); + auto ell_mtx = dmtx->get_ell(); + + ASSERT_EQ(coo_mtx->get_executor(), hip); + ASSERT_EQ(ell_mtx->get_executor(), hip); + ASSERT_EQ(dmtx->get_executor(), hip); +} + + +TEST_F(Hybrid, SimpleApplyIsEquivalentToRef) +{ + set_up_apply_data(); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Hybrid, AdvancedApplyIsEquivalentToRef) +{ + set_up_apply_data(); + + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Hybrid, SimpleApplyToDenseMatrixIsEquivalentToRef) +{ + set_up_apply_data(3); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Hybrid, AdvancedApplyToDenseMatrixIsEquivalentToRef) +{ + set_up_apply_data(3); + + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Hybrid, CountNonzerosIsEquivalentToRef) +{ + set_up_apply_data(); + gko::size_type nonzeros; + gko::size_type dnonzeros; + + gko::kernels::reference::hybrid::count_nonzeros(ref, mtx.get(), &nonzeros); + gko::kernels::hip::hybrid::count_nonzeros(hip, dmtx.get(), &dnonzeros); + + ASSERT_EQ(nonzeros, dnonzeros); +} + + +TEST_F(Hybrid, ConvertToCsrIsEquivalentToRef) +{ + set_up_apply_data(1, std::make_shared(2)); + auto csr_mtx = gko::matrix::Csr<>::create(ref); + auto dcsr_mtx = gko::matrix::Csr<>::create(hip); + + mtx->convert_to(csr_mtx.get()); + dmtx->convert_to(dcsr_mtx.get()); + + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14); +} + + +TEST_F(Hybrid, MoveToCsrIsEquivalentToRef) +{ + set_up_apply_data(1, std::make_shared(2)); + auto csr_mtx = 
gko::matrix::Csr<>::create(ref); + auto dcsr_mtx = gko::matrix::Csr<>::create(hip); + + mtx->move_to(csr_mtx.get()); + dmtx->move_to(dcsr_mtx.get()); + + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14); +} + + +} // namespace diff --git a/hip/test/matrix/sellp_kernels.hip.cpp b/hip/test/matrix/sellp_kernels.hip.cpp new file mode 100644 index 00000000000..410b8f58a19 --- /dev/null +++ b/hip/test/matrix/sellp_kernels.hip.cpp @@ -0,0 +1,297 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/matrix/sellp_kernels.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class Sellp : public ::testing::Test { +protected: + using Mtx = gko::matrix::Sellp<>; + using Vec = gko::matrix::Dense<>; + + Sellp() : rand_engine(42) {} + + void SetUp() + { + ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); + ref = gko::ReferenceExecutor::create(); + hip = gko::HipExecutor::create(0, ref); + } + + void TearDown() + { + if (hip != nullptr) { + ASSERT_NO_THROW(hip->synchronize()); + } + } + + std::unique_ptr gen_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, std::uniform_int_distribution<>(1, num_cols), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + } + + void set_up_apply_vector( + int slice_size = gko::matrix::default_slice_size, + int stride_factor = gko::matrix::default_stride_factor, + int total_cols = 0) + { + mtx = Mtx::create(ref, gko::dim<2>{}, slice_size, stride_factor, total_cols); /* forward the layout arguments so the WithSliceSizeAndStrideFactor tests actually use them */ + mtx->copy_from(gen_mtx(532, 231)); + expected = gen_mtx(532, 1); + y = gen_mtx(231, 1); + alpha = gko::initialize({2.0}, ref); + beta = gko::initialize({-1.0}, ref); + dmtx = Mtx::create(hip); + dmtx->copy_from(mtx.get()); + dresult = Vec::create(hip); + dresult->copy_from(expected.get()); + dy = Vec::create(hip); + dy->copy_from(y.get()); + dalpha = Vec::create(hip); + 
dalpha->copy_from(alpha.get()); + dbeta = Vec::create(hip); + dbeta->copy_from(beta.get()); + } + + void set_up_apply_matrix( + int slice_size = gko::matrix::default_slice_size, + int stride_factor = gko::matrix::default_stride_factor, + int total_cols = 0) + { + mtx = Mtx::create(ref, gko::dim<2>{}, slice_size, stride_factor, total_cols); /* forward the layout arguments so the WithSliceSizeAndStrideFactor tests actually use them */ + mtx->copy_from(gen_mtx(532, 231)); + expected = gen_mtx(532, 64); + y = gen_mtx(231, 64); + alpha = gko::initialize({2.0}, ref); + beta = gko::initialize({-1.0}, ref); + dmtx = Mtx::create(hip); + dmtx->copy_from(mtx.get()); + dresult = Vec::create(hip); + dresult->copy_from(expected.get()); + dy = Vec::create(hip); + dy->copy_from(y.get()); + dalpha = Vec::create(hip); + dalpha->copy_from(alpha.get()); + dbeta = Vec::create(hip); + dbeta->copy_from(beta.get()); + } + + std::shared_ptr ref; + std::shared_ptr hip; + + std::ranlux48 rand_engine; + + std::unique_ptr mtx; + std::unique_ptr expected; + std::unique_ptr y; + std::unique_ptr alpha; + std::unique_ptr beta; + + std::unique_ptr dmtx; + std::unique_ptr dresult; + std::unique_ptr dy; + std::unique_ptr dalpha; + std::unique_ptr dbeta; +}; + + +TEST_F(Sellp, SimpleApplyIsEquivalentToRef) +{ + set_up_apply_vector(); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + auto result = Vec::create(ref); + result->copy_from(dresult.get()); + GKO_ASSERT_MTX_NEAR(result, expected, 1e-14); +} + + +TEST_F(Sellp, AdvancedApplyIsEquivalentToRef) +{ + set_up_apply_vector(); + + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + auto result = Vec::create(ref); + result->copy_from(dresult.get()); + GKO_ASSERT_MTX_NEAR(result, expected, 1e-14); +} + + +TEST_F(Sellp, SimpleApplyWithSliceSizeAndStrideFactorIsEquivalentToRef) +{ + set_up_apply_vector(32, 2); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + auto result = Vec::create(ref); + result->copy_from(dresult.get()); + 
GKO_ASSERT_MTX_NEAR(result, expected, 1e-14); +} + + +TEST_F(Sellp, AdvancedApplyWithSliceSizeAndStrideFActorIsEquivalentToRef) +{ + set_up_apply_vector(32, 2); + + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + auto result = Vec::create(ref); + result->copy_from(dresult.get()); + GKO_ASSERT_MTX_NEAR(result, expected, 1e-14); +} + + +TEST_F(Sellp, SimpleApplyMultipleRHSIsEquivalentToRef) +{ + set_up_apply_matrix(); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + auto result = Vec::create(ref); + result->copy_from(dresult.get()); + GKO_ASSERT_MTX_NEAR(result, expected, 1e-14); +} + + +TEST_F(Sellp, AdvancedApplyMultipleRHSIsEquivalentToRef) +{ + set_up_apply_matrix(); + + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + auto result = Vec::create(ref); + result->copy_from(dresult.get()); + GKO_ASSERT_MTX_NEAR(result, expected, 1e-14); +} + + +TEST_F(Sellp, + SimpleApplyMultipleRHSWithSliceSizeAndStrideFactorIsEquivalentToRef) +{ + set_up_apply_matrix(32, 2); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + auto result = Vec::create(ref); + result->copy_from(dresult.get()); + GKO_ASSERT_MTX_NEAR(result, expected, 1e-14); +} + + +TEST_F(Sellp, + AdvancedApplyMultipleRHSWithSliceSizeAndStrideFActorIsEquivalentToRef) +{ + set_up_apply_matrix(32, 2); + + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + auto result = Vec::create(ref); + result->copy_from(dresult.get()); + GKO_ASSERT_MTX_NEAR(result, expected, 1e-14); +} + + +TEST_F(Sellp, ConvertToDenseIsEquivalentToRef) +{ + set_up_apply_matrix(); + + auto dense_mtx = gko::matrix::Dense<>::create(ref); + auto ddense_mtx = gko::matrix::Dense<>::create(hip); + + mtx->convert_to(dense_mtx.get()); + 
dmtx->convert_to(ddense_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 1e-14); +} + + +TEST_F(Sellp, ConvertToCsrIsEquivalentToRef) +{ + set_up_apply_matrix(); + + auto csr_mtx = gko::matrix::Csr<>::create(ref); + auto dcsr_mtx = gko::matrix::Csr<>::create(hip); + + mtx->convert_to(csr_mtx.get()); + dmtx->convert_to(dcsr_mtx.get()); + + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14); +} + + +TEST_F(Sellp, CountNonzerosIsEquivalentToRef) +{ + set_up_apply_matrix(); + + gko::size_type nnz; + gko::size_type dnnz; + + gko::kernels::reference::sellp::count_nonzeros(ref, mtx.get(), &nnz); + gko::kernels::hip::sellp::count_nonzeros(hip, dmtx.get(), &dnnz); + + ASSERT_EQ(nnz, dnnz); +} + + +} // namespace diff --git a/hip/test/preconditioner/CMakeLists.txt b/hip/test/preconditioner/CMakeLists.txt new file mode 100644 index 00000000000..6f974174421 --- /dev/null +++ b/hip/test/preconditioner/CMakeLists.txt @@ -0,0 +1,2 @@ +ginkgo_create_hip_test_special_linkage(jacobi_kernels) +ginkgo_create_hip_test(isai_kernels) diff --git a/hip/test/preconditioner/isai_kernels.hip.cpp b/hip/test/preconditioner/isai_kernels.hip.cpp new file mode 100644 index 00000000000..88f67c0adb3 --- /dev/null +++ b/hip/test/preconditioner/isai_kernels.hip.cpp @@ -0,0 +1,326 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include + + +#include "core/preconditioner/isai_kernels.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +enum struct matrix_type { lower, upper }; +class Isai : public ::testing::Test { +protected: + using value_type = double; + using index_type = gko::int32; + using Csr = gko::matrix::Csr; + using Dense = gko::matrix::Dense; + Isai() : rand_engine(42) {} + + void SetUp() + { + ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); + ref = gko::ReferenceExecutor::create(); + hip = gko::HipExecutor::create(0, ref); + } + + std::unique_ptr clone_allocations(const Csr *csr_mtx) + { + if (csr_mtx->get_executor() != ref) { + return {nullptr}; + } + const auto num_elems = csr_mtx->get_num_stored_elements(); + auto sparsity = csr_mtx->clone(); + + // values are now filled with invalid data to catch potential errors + auto begin_values = 
sparsity->get_values(); + auto end_values = begin_values + num_elems; + std::fill(begin_values, end_values, -gko::one()); + return sparsity; + } + + void initialize_data(matrix_type type, gko::size_type n, + gko::size_type row_limit) + { + const bool for_lower_tm = type == matrix_type::lower; + auto nz_dist = std::uniform_int_distribution(1, row_limit); + auto val_dist = std::uniform_real_distribution(-1., 1.); + mtx = Csr::create(ref); + mtx = gko::test::generate_random_triangular_matrix( + n, n, true, for_lower_tm, nz_dist, val_dist, rand_engine, ref, + gko::dim<2>{n, n}); + inverse = clone_allocations(mtx.get()); + + d_mtx = Csr::create(hip); + d_mtx->copy_from(mtx.get()); + d_inverse = Csr::create(hip); + d_inverse->copy_from(inverse.get()); + } + + + std::shared_ptr ref; + std::shared_ptr hip; + + std::default_random_engine rand_engine; + + std::unique_ptr mtx; + std::unique_ptr inverse; + + std::unique_ptr d_mtx; + std::unique_ptr d_inverse; +}; + + +TEST_F(Isai, HipIsaiGenerateLinverseShortIsEquivalentToRef) +{ + initialize_data(matrix_type::lower, 536, 31); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(hip, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::kernels::hip::isai::generate_tri_inverse( + hip, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + true); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_EQ(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, HipIsaiGenerateUinverseShortIsEquivalentToRef) +{ + initialize_data(matrix_type::upper, 615, 31); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(hip, num_rows + 1); + auto da2 = da1; + + 
gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::kernels::hip::isai::generate_tri_inverse( + hip, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + false); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_EQ(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, HipIsaiGenerateLinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::lower, 554, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(hip, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::kernels::hip::isai::generate_tri_inverse( + hip, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + true); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_GT(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, HipIsaiGenerateUinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::upper, 695, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(hip, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::kernels::hip::isai::generate_tri_inverse( + hip, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + false); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_GT(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, 
HipIsaiGenerateExcessLinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::lower, 518, 40); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::Array da1(hip, a1); + gko::Array da2(hip, a2); + auto e_dim = a1.get_data()[num_rows]; + auto e_nnz = a2.get_data()[num_rows]; + auto excess = Csr::create(ref, gko::dim<2>(e_dim, e_dim), e_nnz); + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + auto dexcess = Csr::create(hip, gko::dim<2>(e_dim, e_dim), e_nnz); + auto de_rhs = Dense::create(hip, gko::dim<2>(e_dim, 1)); + + gko::kernels::reference::isai::generate_excess_system( + ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), + excess.get(), e_rhs.get()); + gko::kernels::hip::isai::generate_excess_system( + hip, d_mtx.get(), d_inverse.get(), da1.get_const_data(), + da2.get_const_data(), dexcess.get(), de_rhs.get()); + + GKO_ASSERT_MTX_EQ_SPARSITY(excess, dexcess); + GKO_ASSERT_MTX_NEAR(excess, dexcess, 0); + GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, HipIsaiGenerateExcessUinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::upper, 673, 51); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::Array da1(hip, a1); + gko::Array da2(hip, a2); + auto e_dim = a1.get_data()[num_rows]; + auto e_nnz = a2.get_data()[num_rows]; + auto excess = Csr::create(ref, gko::dim<2>(e_dim, e_dim), e_nnz); + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + auto dexcess = Csr::create(hip, gko::dim<2>(e_dim, e_dim), e_nnz); + auto de_rhs = Dense::create(hip, gko::dim<2>(e_dim, 1)); + + gko::kernels::reference::isai::generate_excess_system( + ref, 
mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), + excess.get(), e_rhs.get()); + gko::kernels::hip::isai::generate_excess_system( + hip, d_mtx.get(), d_inverse.get(), da1.get_const_data(), + da2.get_const_data(), dexcess.get(), de_rhs.get()); + + GKO_ASSERT_MTX_EQ_SPARSITY(excess, dexcess); + GKO_ASSERT_MTX_NEAR(excess, dexcess, 0); + GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, HipIsaiScatterExcessSolutionLIsEquivalentToRef) +{ + initialize_data(matrix_type::lower, 572, 52); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::Array da1(hip, a1); + auto e_dim = a1.get_data()[num_rows]; + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + std::fill_n(e_rhs->get_values(), e_dim, 123456); + auto de_rhs = Dense::create(hip); + de_rhs->copy_from(lend(e_rhs)); + d_inverse->copy_from(lend(inverse)); + + gko::kernels::reference::isai::scatter_excess_solution( + ref, a1.get_const_data(), e_rhs.get(), inverse.get()); + gko::kernels::hip::isai::scatter_excess_solution( + hip, da1.get_const_data(), de_rhs.get(), d_inverse.get()); + + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, HipIsaiScatterExcessSolutionUIsEquivalentToRef) +{ + initialize_data(matrix_type::upper, 702, 45); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::Array da1(hip, a1); + auto e_dim = a1.get_data()[num_rows]; + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + std::fill_n(e_rhs->get_values(), e_dim, 123456); + auto de_rhs = Dense::create(hip); + de_rhs->copy_from(lend(e_rhs)); + // overwrite -1 values with inverse + 
d_inverse->copy_from(lend(inverse)); + + gko::kernels::reference::isai::scatter_excess_solution( + ref, a1.get_const_data(), e_rhs.get(), inverse.get()); + gko::kernels::hip::isai::scatter_excess_solution( + hip, da1.get_const_data(), de_rhs.get(), d_inverse.get()); + + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); + ASSERT_GT(e_dim, 0); +} + + +} // namespace diff --git a/hip/test/preconditioner/jacobi_kernels.cpp b/hip/test/preconditioner/jacobi_kernels.cpp new file mode 100644 index 00000000000..f1863a6b42f --- /dev/null +++ b/hip/test/preconditioner/jacobi_kernels.cpp @@ -0,0 +1,847 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include + + +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class Jacobi : public ::testing::Test { +protected: + using Bj = gko::preconditioner::Jacobi<>; + using Mtx = gko::matrix::Csr<>; + using Vec = gko::matrix::Dense<>; + using mtx_data = gko::matrix_data<>; + + void SetUp() + { + ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); + ref = gko::ReferenceExecutor::create(); + hip = gko::HipExecutor::create(0, ref); + } + + void TearDown() + { + if (hip != nullptr) { + ASSERT_NO_THROW(hip->synchronize()); + } + } + + void initialize_data( + std::initializer_list block_pointers, + std::initializer_list block_precisions, + std::initializer_list condition_numbers, + gko::uint32 max_block_size, int min_nnz, int max_nnz, int num_rhs = 1, + double accuracy = 0.1) + { + std::ranlux48 engine(42); + const auto dim = *(end(block_pointers) - 1); + if (condition_numbers.size() == 0) { + mtx = gko::test::generate_random_matrix( + dim, dim, std::uniform_int_distribution<>(min_nnz, max_nnz), + std::normal_distribution<>(0.0, 1.0), engine, ref); + } else { + std::vector blocks; + for (gko::size_type i = 0; i < block_pointers.size() - 1; ++i) { + const auto size = + begin(block_pointers)[i + 1] - begin(block_pointers)[i]; + const auto cond = begin(condition_numbers)[i]; + blocks.push_back(mtx_data::cond( + size, cond, 
std::normal_distribution<>(-1, 1), engine)); + } + mtx = Mtx::create(ref); + mtx->read(mtx_data::diag(begin(blocks), end(blocks))); + } + gko::Array block_ptrs(ref, block_pointers); + gko::Array block_prec(ref, block_precisions); + if (block_prec.get_num_elems() == 0) { + bj_factory = + Bj::build() + .with_max_block_size(max_block_size) + .with_block_pointers(block_ptrs) + .with_max_block_stride(gko::uint32(hip->get_warp_size())) + .on(ref); + d_bj_factory = Bj::build() + .with_max_block_size(max_block_size) + .with_block_pointers(block_ptrs) + .on(hip); + } else { + bj_factory = + Bj::build() + .with_max_block_size(max_block_size) + .with_block_pointers(block_ptrs) + .with_max_block_stride(gko::uint32(hip->get_warp_size())) + .with_storage_optimization(block_prec) + .with_accuracy(accuracy) + .on(ref); + d_bj_factory = Bj::build() + .with_max_block_size(max_block_size) + .with_block_pointers(block_ptrs) + .with_storage_optimization(block_prec) + .with_accuracy(accuracy) + .on(hip); + } + b = gko::test::generate_random_matrix( + dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs), + std::normal_distribution<>(0.0, 1.0), engine, ref); + d_b = Vec::create(hip); + d_b->copy_from(b.get()); + x = gko::test::generate_random_matrix( + dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs), + std::normal_distribution<>(0.0, 1.0), engine, ref); + d_x = Vec::create(hip); + d_x->copy_from(x.get()); + } + + const gko::precision_reduction dp{}; + const gko::precision_reduction sp{0, 1}; + const gko::precision_reduction hp{0, 2}; + const gko::precision_reduction tp{1, 0}; + const gko::precision_reduction qp{2, 0}; + const gko::precision_reduction up{1, 1}; + const gko::precision_reduction ap{gko::precision_reduction::autodetect()}; + + std::shared_ptr ref; + std::shared_ptr hip; + std::shared_ptr mtx; + std::unique_ptr x; + std::unique_ptr b; + std::unique_ptr d_x; + std::unique_ptr d_b; + + std::unique_ptr bj_factory; + std::unique_ptr d_bj_factory; +}; + + 
+TEST_F(Jacobi, HipFindNaturalBlocksEquivalentToRef) +{ + /* example matrix: + 1 1 + 1 1 + 1 1 + 1 1 + */ + auto mtx = share(Mtx::create(ref)); + mtx->read({{4, 4}, + {{0, 0, 1.0}, + {0, 1, 1.0}, + {1, 0, 1.0}, + {1, 1, 1.0}, + {2, 0, 1.0}, + {2, 2, 1.0}, + {3, 0, 1.0}, + {3, 2, 1.0}}}); + + auto bj = Bj::build().with_max_block_size(3u).on(ref)->generate(mtx); + auto d_bj = Bj::build().with_max_block_size(3u).on(hip)->generate(mtx); + + ASSERT_EQ(d_bj->get_num_blocks(), bj->get_num_blocks()); + // TODO: actually check if the results are the same +} + + +TEST_F(Jacobi, HipExecutesSupervariableAgglomerationEquivalentToRef) +{ + /* example matrix: + 1 1 + 1 1 + 1 1 + 1 1 + 1 + */ + auto mtx = share(Mtx::create(ref)); + mtx->read({{5, 5}, + {{0, 0, 1.0}, + {0, 1, 1.0}, + {1, 0, 1.0}, + {1, 1, 1.0}, + {2, 2, 1.0}, + {2, 3, 1.0}, + {3, 2, 1.0}, + {3, 3, 1.0}, + {4, 4, 1.0}}}); + + auto bj = Bj::build().with_max_block_size(3u).on(ref)->generate(mtx); + auto d_bj = Bj::build().with_max_block_size(3u).on(hip)->generate(mtx); + + ASSERT_EQ(d_bj->get_num_blocks(), bj->get_num_blocks()); + // TODO: actually check if the results are the same +} + + +TEST_F(Jacobi, HipFindNaturalBlocksInLargeMatrixEquivalentToRef) +{ + /* example matrix: + 1 1 + 1 1 + 1 1 + 1 1 + 1 1 + 1 1 + */ + using data = gko::matrix_data; + auto mtx = share(Mtx::create(ref)); + mtx->read(data::diag({550, 550}, {{1.0, 1.0, 0.0, 0.0, 0.0, 0.0}, + {1.0, 1.0, 0.0, 0.0, 0.0, 0.0}, + {1.0, 0.0, 1.0, 0.0, 0.0, 0.0}, + {1.0, 0.0, 1.0, 0.0, 0.0, 0.0}, + {1.0, 0.0, 1.0, 0.0, 0.0, 0.0}, + {1.0, 0.0, 1.0, 0.0, 0.0, 0.0}})); + + auto bj = Bj::build().with_max_block_size(3u).on(ref)->generate(mtx); + auto d_bj = Bj::build().with_max_block_size(3u).on(hip)->generate(mtx); + + ASSERT_EQ(d_bj->get_num_blocks(), bj->get_num_blocks()); + // TODO: actually check if the results are the same +} + + +TEST_F(Jacobi, + HipExecutesSupervariableAgglomerationInLargeMatrixEquivalentToRef) +{ + /* example matrix: + 1 1 + 1 1 + 1 1 + 1 1 
+ 1 + */ + using data = gko::matrix_data; + auto mtx = share(Mtx::create(ref)); + mtx->read(data::diag({550, 550}, {{1.0, 1.0, 0.0, 0.0, 0.0}, + {1.0, 1.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 1.0, 1.0, 0.0}, + {0.0, 0.0, 1.0, 1.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 1.0}})); + + auto bj = Bj::build().with_max_block_size(3u).on(ref)->generate(mtx); + auto d_bj = Bj::build().with_max_block_size(3u).on(hip)->generate(mtx); + + ASSERT_EQ(d_bj->get_num_blocks(), bj->get_num_blocks()); + // TODO: actually check if the results are the same +} + + +TEST_F(Jacobi, + HipExecutesSupervarAgglomerationEquivalentToRefFor150NonzerowsPerRow) +{ + /* example matrix duplicated 50 times: + 1 1 1 + 1 1 1 + 1 1 1 + 1 1 1 + 1 1 + */ + using data = gko::matrix_data; + auto mtx = share(Mtx::create(ref)); + mtx->read({{50, 50}, + {{1.0, 1.0, 0.0, 1.0, 0.0}, + {1.0, 1.0, 0.0, 1.0, 0.0}, + {1.0, 0.0, 1.0, 1.0, 0.0}, + {1.0, 0.0, 1.0, 1.0, 0.0}, + {0.0, 0.0, 1.0, 0.0, 1.0}}}); + + + auto bj = Bj::build().with_max_block_size(3u).on(ref)->generate(mtx); + auto d_bj = Bj::build().with_max_block_size(3u).on(hip)->generate(mtx); + + ASSERT_EQ(d_bj->get_num_blocks(), bj->get_num_blocks()); + // TODO: actually check if the results are the same +} + + +TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithBlockSize32) +{ + initialize_data({0, 32, 64, 96, 128}, {}, {}, 32, 100, 110); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), 1e-13); +} + + +#if GINKGO_HIP_PLATFORM_HCC +TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithBlockSize64) +{ + initialize_data({0, 64, 128, 192, 256}, {}, {}, 64, 100, 110); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), 1e-13); +} +#endif + + +TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithDifferentBlockSize) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 32, 
+ 97, 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), 1e-13); +} + + +TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithMPW) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), 1e-13); +} + + +TEST_F(Jacobi, HipTransposedPreconditionerEquivalentToRefWithMPW) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + d_bj->copy_from(bj.get()); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj->transpose()), + gko::as(bj->transpose()), 1e-14); +} + + +TEST_F(Jacobi, HipConjTransposedPreconditionerEquivalentToRefWithMPW) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + d_bj->copy_from(bj.get()); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj->conj_transpose()), + gko::as(bj->conj_transpose()), 1e-14); +} + + +TEST_F(Jacobi, HipApplyEquivalentToRefWithBlockSize32) +{ + initialize_data({0, 32, 64, 96, 128}, {}, {}, 32, 100, 111); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +#if GINKGO_HIP_PLATFORM_HCC +TEST_F(Jacobi, HipApplyEquivalentToRefWithBlockSize64) +{ + initialize_data({0, 64, 128, 192, 256}, {}, {}, 64, 100, 111); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} +#endif + + +TEST_F(Jacobi, HipApplyEquivalentToRefWithDifferentBlockSize) +{ + initialize_data({0, 
11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 32, + 97, 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, HipApplyEquivalentToRef) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, HipLinearCombinationApplyEquivalentToRef) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99); + auto alpha = gko::initialize({2.0}, ref); + auto d_alpha = gko::initialize({2.0}, hip); + auto beta = gko::initialize({-1.0}, ref); + auto d_beta = gko::initialize({-1.0}, hip); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(alpha.get(), b.get(), beta.get(), x.get()); + d_bj->apply(d_alpha.get(), d_b.get(), d_beta.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, HipApplyToMultipleVectorsEquivalentToRef) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99, 5); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, HipLinearCombinationApplyToMultipleVectorsEquivalentToRef) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99, 5); + auto alpha = gko::initialize({2.0}, ref); + auto d_alpha = gko::initialize({2.0}, hip); + auto beta = gko::initialize({-1.0}, ref); + auto d_beta = gko::initialize({-1.0}, hip); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(alpha.get(), b.get(), 
beta.get(), x.get()); + d_bj->apply(d_alpha.get(), d_b.get(), d_beta.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, ComputesTheSameConditionNumberAsRef) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = clone(ref, d_bj_factory->generate(mtx)); + + for (int i = 0; i < gko::as(bj.get())->get_num_blocks(); ++i) { + EXPECT_NEAR(bj->get_conditioning()[i], d_bj->get_conditioning()[i], + 1e-9); + } +} + + +TEST_F(Jacobi, SelectsTheSamePrecisionsAsRef) +{ + initialize_data( + {0, 2, 14, 27, 40, 51, 61, 70, 80, 92, 100}, + {ap, ap, ap, ap, ap, ap, ap, ap, ap, ap}, + {1e+0, 1e+0, 1e+2, 1e+3, 1e+4, 1e+4, 1e+6, 1e+7, 1e+8, 1e+9}, 13, 97, + 99, 1, 0.2); + + auto bj = bj_factory->generate(mtx); + auto d_bj = gko::clone(ref, d_bj_factory->generate(mtx)); + + auto bj_prec = + bj->get_parameters().storage_optimization.block_wise.get_const_data(); + auto d_bj_prec = + d_bj->get_parameters().storage_optimization.block_wise.get_const_data(); + for (int i = 0; i < gko::as(bj.get())->get_num_blocks(); ++i) { + EXPECT_EQ(bj_prec[i], d_bj_prec[i]); + } +} + + +TEST_F(Jacobi, AvoidsPrecisionsThatOverflow) +{ + auto mtx = gko::matrix::Csr<>::create(hip); + // clang-format off + mtx->read(mtx_data::diag({ + // perfectly conditioned block, small value difference, + // can use fp16 (5, 10) + {{2.0, 1.0}, + {1.0, 2.0}}, + // perfectly conditioned block (scaled orthogonal), + // with large value difference, need fp16 (7, 8) + {{1e-8, -1e-16}, + {1e-16, 1e-8}} + })); + // clang-format on + + auto bj = + Bj::build() + .with_max_block_size(13u) + .with_block_pointers(gko::Array(hip, {0, 2, 4})) + .with_storage_optimization(gko::precision_reduction::autodetect()) + .with_accuracy(0.1) + .on(hip) + ->generate(give(mtx)); + + // both blocks are in the same group, both need (7, 8) + auto h_bj = clone(ref, bj); + auto prec = + 
h_bj->get_parameters().storage_optimization.block_wise.get_const_data(); + EXPECT_EQ(prec[0], gko::precision_reduction(1, 1)); + ASSERT_EQ(prec[1], gko::precision_reduction(1, 1)); +} + + +TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithFullPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-13); +} + + +TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithReducedPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-7); +} + + +TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithCustomReducedPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {tp, tp, tp, tp, tp, tp, tp, tp, tp, tp, tp}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-6); +} + + +TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithQuarteredPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {hp, hp, hp, hp, hp, hp, hp, hp, hp, hp, hp}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-3); +} + + +TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithCustomQuarteredPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {qp, qp, qp, qp, qp, qp, qp, qp, qp, qp, qp}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-1); +} + + +TEST_F(Jacobi, 
HipPreconditionerEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-1); +} + + +TEST_F(Jacobi, HipTransposedPreconditionerEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + bj->copy_from(d_bj.get()); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj->transpose()), + gko::as(bj->transpose()), 1e-14); +} + + +TEST_F(Jacobi, + HipConjTransposedPreconditionerEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + bj->copy_from(d_bj.get()); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj->conj_transpose()), + gko::as(bj->conj_transpose()), 1e-14); +} + + +TEST_F(Jacobi, HipApplyEquivalentToRefWithFullPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, + 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, HipApplyEquivalentToRefWithReducedPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 13, 97, + 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6); +} + + +TEST_F(Jacobi, 
HipApplyEquivalentToRefWithCustomReducedPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {tp, tp, tp, tp, tp, tp, tp, tp, tp, tp, tp}, {}, 13, 97, + 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-5); +} + + +TEST_F(Jacobi, HipApplyEquivalentToRefWithQuarteredPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {hp, hp, hp, hp, hp, hp, hp, hp, hp, hp, hp}, {}, 13, 97, + 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-2); +} + + +TEST_F(Jacobi, HipApplyEquivalentToRefWithCustomReducedAndReducedPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {up, up, up, up, up, up, up, up, up, up, up}, {}, 13, 97, + 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-2); +} + + +TEST_F(Jacobi, HipApplyEquivalentToRefWithCustomQuarteredPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {qp, qp, qp, qp, qp, qp, qp, qp, qp, qp, qp}, {}, 13, 97, + 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6); +} + + +TEST_F(Jacobi, HipApplyEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-1); +} + + +TEST_F(Jacobi, 
HipLinearCombinationApplyEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, dp, dp, sp, sp, sp, dp, dp, sp, dp, sp}, {}, 13, 97, + 99); + auto alpha = gko::initialize({2.0}, ref); + auto d_alpha = gko::initialize({2.0}, hip); + auto beta = gko::initialize({-1.0}, ref); + auto d_beta = gko::initialize({-1.0}, hip); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6); +} + + +TEST_F(Jacobi, HipApplyToMultipleVectorsEquivalentToRefWithFullPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, + 99, 5); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, HipApplyToMultipleVectorsEquivalentToRefWithReducedPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 13, 97, + 99, 5); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6); +} + + +TEST_F(Jacobi, HipApplyToMultipleVectorsEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + 99, 5); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-1); +} + + +TEST_F( + Jacobi, + HipLinearCombinationApplyToMultipleVectorsEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, dp, dp, sp, 
sp, sp, dp, dp, sp, dp, sp}, {}, 13, 97, + 99, 5); + auto alpha = gko::initialize({2.0}, ref); + auto d_alpha = gko::initialize({2.0}, hip); + auto beta = gko::initialize({-1.0}, ref); + auto d_beta = gko::initialize({-1.0}, hip); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6); +} + + +} // namespace diff --git a/hip/test/solver/CMakeLists.txt b/hip/test/solver/CMakeLists.txt new file mode 100644 index 00000000000..3ec7956cf65 --- /dev/null +++ b/hip/test/solver/CMakeLists.txt @@ -0,0 +1,9 @@ +ginkgo_create_hip_test_special_linkage(bicg_kernels) +ginkgo_create_hip_test_special_linkage(bicgstab_kernels) +ginkgo_create_hip_test_special_linkage(cg_kernels) +ginkgo_create_hip_test_special_linkage(cgs_kernels) +ginkgo_create_hip_test_special_linkage(fcg_kernels) +ginkgo_create_hip_test_special_linkage(gmres_kernels) +ginkgo_create_hip_test_special_linkage(ir_kernels) +ginkgo_create_hip_test_special_linkage(lower_trs_kernels) +ginkgo_create_hip_test_special_linkage(upper_trs_kernels) diff --git a/hip/test/solver/bicg_kernels.cpp b/hip/test/solver/bicg_kernels.cpp new file mode 100644 index 00000000000..67fda77f84b --- /dev/null +++ b/hip/test/solver/bicg_kernels.cpp @@ -0,0 +1,357 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include +#include +#include + + +#include + + +#include +#include +#include +#include +#include +#include + + +#include "core/solver/bicg_kernels.hpp" +#include "hip/test/utils.hip.hpp" +#include "matrices/config.hpp" + + +namespace { + + +class Bicg : public ::testing::Test { +protected: + using value_type = gko::default_precision; + using index_type = gko::int32; + using Mtx = gko::matrix::Dense<>; + using Csr = gko::matrix::Csr; + Bicg() : rand_engine(30) {} + + void SetUp() + { + ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); + ref = gko::ReferenceExecutor::create(); + hip = gko::HipExecutor::create(0, ref); + + std::string file_name(gko::matrices::location_ani1_mtx); + auto input_file = std::ifstream(file_name, std::ios::in); + if (!input_file) { + FAIL() << "Could not find the file \"" << file_name + << "\", which is required for this test.\n"; + } + csr_ref = gko::read(input_file, ref); + auto csr_hip_temp = Csr::create(hip); + 
csr_hip_temp->copy_from(gko::lend(csr_ref)); + csr_hip = gko::give(csr_hip_temp); + } + + void TearDown() + { + if (hip != nullptr) { + ASSERT_NO_THROW(hip->synchronize()); + } + } + + std::unique_ptr gen_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + } + + void initialize_data() + { + int m = 597; + int n = 43; + b = gen_mtx(m, n); + r = gen_mtx(m, n); + z = gen_mtx(m, n); + p = gen_mtx(m, n); + q = gen_mtx(m, n); + r2 = gen_mtx(m, n); + z2 = gen_mtx(m, n); + p2 = gen_mtx(m, n); + q2 = gen_mtx(m, n); + x = gen_mtx(m, n); + beta = gen_mtx(1, n); + prev_rho = gen_mtx(1, n); + rho = gen_mtx(1, n); + stop_status = std::unique_ptr>( + new gko::Array(ref, n)); + for (size_t i = 0; i < stop_status->get_num_elems(); ++i) { + stop_status->get_data()[i].reset(); + } + + d_b = Mtx::create(hip); + d_b->copy_from(b.get()); + d_r = Mtx::create(hip); + d_r->copy_from(r.get()); + d_z = Mtx::create(hip); + d_z->copy_from(z.get()); + d_p = Mtx::create(hip); + d_p->copy_from(p.get()); + d_q = Mtx::create(hip); + d_q->copy_from(q.get()); + d_r2 = Mtx::create(hip); + d_r2->copy_from(r2.get()); + d_z2 = Mtx::create(hip); + d_z2->copy_from(z2.get()); + d_p2 = Mtx::create(hip); + d_p2->copy_from(p2.get()); + d_q2 = Mtx::create(hip); + d_q2->copy_from(q2.get()); + d_x = Mtx::create(hip); + d_x->copy_from(x.get()); + d_beta = Mtx::create(hip); + d_beta->copy_from(beta.get()); + d_prev_rho = Mtx::create(hip); + d_prev_rho->copy_from(prev_rho.get()); + d_rho = Mtx::create(hip); + d_rho->copy_from(rho.get()); + d_stop_status = std::unique_ptr>( + new gko::Array(hip, n)); + *d_stop_status = *stop_status; + } + + void make_symetric(Mtx *mtx) + { + for (int i = 0; i < mtx->get_size()[0]; ++i) { + for (int j = i + 1; j < mtx->get_size()[1]; ++j) { + mtx->at(i, j) = mtx->at(j, i); + } + } + } + + void make_diag_dominant(Mtx 
*mtx) + { + using std::abs; + for (int i = 0; i < mtx->get_size()[0]; ++i) { + auto sum = gko::zero(); + for (int j = 0; j < mtx->get_size()[1]; ++j) { + sum += abs(mtx->at(i, j)); + } + mtx->at(i, i) = sum; + } + } + + void make_spd(Mtx *mtx) + { + make_symetric(mtx); + make_diag_dominant(mtx); + } + + std::shared_ptr ref; + std::shared_ptr hip; + + std::ranlux48 rand_engine; + + std::unique_ptr b; + std::unique_ptr r; + std::unique_ptr z; + std::unique_ptr p; + std::unique_ptr q; + std::unique_ptr r2; + std::unique_ptr z2; + std::unique_ptr p2; + std::unique_ptr q2; + std::unique_ptr x; + std::unique_ptr beta; + std::unique_ptr prev_rho; + std::unique_ptr rho; + std::unique_ptr> stop_status; + + std::unique_ptr d_b; + std::unique_ptr d_r; + std::unique_ptr d_z; + std::unique_ptr d_p; + std::unique_ptr d_q; + std::unique_ptr d_r2; + std::unique_ptr d_z2; + std::unique_ptr d_p2; + std::unique_ptr d_q2; + std::unique_ptr d_x; + std::unique_ptr d_beta; + std::unique_ptr d_prev_rho; + std::unique_ptr d_rho; + std::unique_ptr> d_stop_status; + std::shared_ptr csr_ref; + std::shared_ptr csr_hip; +}; + + +TEST_F(Bicg, HipBicgInitializeIsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::bicg::initialize( + ref, b.get(), r.get(), z.get(), p.get(), q.get(), prev_rho.get(), + rho.get(), r2.get(), z2.get(), p2.get(), q2.get(), stop_status.get()); + gko::kernels::hip::bicg::initialize( + hip, d_b.get(), d_r.get(), d_z.get(), d_p.get(), d_q.get(), + d_prev_rho.get(), d_rho.get(), d_r2.get(), d_z2.get(), d_p2.get(), + d_q2.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14); + GKO_ASSERT_MTX_NEAR(d_z, z, 1e-14); + GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14); + GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14); + GKO_ASSERT_MTX_NEAR(d_r2, r2, 1e-14); + GKO_ASSERT_MTX_NEAR(d_z2, z2, 1e-14); + GKO_ASSERT_MTX_NEAR(d_p2, p2, 1e-14); + GKO_ASSERT_MTX_NEAR(d_q2, q2, 1e-14); + GKO_ASSERT_MTX_NEAR(d_prev_rho, prev_rho, 1e-14); + GKO_ASSERT_MTX_NEAR(d_rho, rho, 1e-14); + 
GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status); +} + + +TEST_F(Bicg, HipBicgStep1IsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::bicg::step_1(ref, p.get(), z.get(), p2.get(), + z2.get(), rho.get(), prev_rho.get(), + stop_status.get()); + gko::kernels::hip::bicg::step_1(hip, d_p.get(), d_z.get(), d_p2.get(), + d_z2.get(), d_rho.get(), d_prev_rho.get(), + d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14); + GKO_ASSERT_MTX_NEAR(d_z, z, 1e-14); + GKO_ASSERT_MTX_NEAR(d_p2, p2, 1e-14); + GKO_ASSERT_MTX_NEAR(d_z2, z2, 1e-14); +} + + +TEST_F(Bicg, HipBicgStep2IsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::bicg::step_2( + ref, x.get(), r.get(), r2.get(), p.get(), q.get(), q2.get(), beta.get(), + rho.get(), stop_status.get()); + gko::kernels::hip::bicg::step_2( + hip, d_x.get(), d_r.get(), d_r2.get(), d_p.get(), d_q.get(), d_q2.get(), + d_beta.get(), d_rho.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); + GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14); + GKO_ASSERT_MTX_NEAR(d_r2, r2, 1e-14); + GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14); + GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14); + GKO_ASSERT_MTX_NEAR(d_q2, q2, 1e-14); +} + + +TEST_F(Bicg, ApplyWithSpdMatrixIsEquivalentToRef) +{ + auto mtx = gen_mtx(50, 50); + make_spd(mtx.get()); + auto x = gen_mtx(50, 3); + auto b = gen_mtx(50, 3); + auto d_mtx = Mtx::create(hip); + d_mtx->copy_from(mtx.get()); + auto d_x = Mtx::create(hip); + d_x->copy_from(x.get()); + auto d_b = Mtx::create(hip); + d_b->copy_from(b.get()); + auto bicg_factory = + gko::solver::Bicg<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(50u).on(ref), + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(1e-14) + .on(ref)) + .on(ref); + auto d_bicg_factory = + gko::solver::Bicg<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(50u).on(hip), + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(1e-14) + .on(hip)) + 
.on(hip); + auto solver = bicg_factory->generate(std::move(mtx)); + auto d_solver = d_bicg_factory->generate(std::move(d_mtx)); + + solver->apply(b.get(), x.get()); + d_solver->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); +} + + +TEST_F(Bicg, ApplyWithSuiteSparseMatrixIsEquivalentToRef) +{ + auto x = gen_mtx(36, 1); + auto b = gen_mtx(36, 1); + auto d_x = Mtx::create(hip); + d_x->copy_from(x.get()); + auto d_b = Mtx::create(hip); + d_b->copy_from(b.get()); + auto bicg_factory = + gko::solver::Bicg<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(50u).on(ref), + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(1e-14) + .on(ref)) + .on(ref); + auto d_bicg_factory = + gko::solver::Bicg<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(50u).on(hip), + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(1e-14) + .on(hip)) + .on(hip); + auto solver = bicg_factory->generate(std::move(csr_ref)); + auto d_solver = d_bicg_factory->generate(std::move(csr_hip)); + + solver->apply(b.get(), x.get()); + d_solver->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); +} + + +} // namespace diff --git a/hip/test/solver/bicgstab_kernels.cpp b/hip/test/solver/bicgstab_kernels.cpp new file mode 100644 index 00000000000..999b40bebaa --- /dev/null +++ b/hip/test/solver/bicgstab_kernels.cpp @@ -0,0 +1,357 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include +#include +#include +#include + + +#include "core/solver/bicgstab_kernels.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class Bicgstab : public ::testing::Test { +protected: + using Mtx = gko::matrix::Dense<>; + using Solver = gko::solver::Bicgstab<>; + + Bicgstab() : rand_engine(30) {} + + void SetUp() + { + ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); + ref = gko::ReferenceExecutor::create(); + hip = gko::HipExecutor::create(0, ref); + + mtx = gen_mtx(123, 123); + make_diag_dominant(mtx.get()); + d_mtx = Mtx::create(hip); + d_mtx->copy_from(mtx.get()); + + hip_bicgstab_factory = + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(246u).on(hip), + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(1e-15) + .on(hip)) + .on(hip); + ref_bicgstab_factory = + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(246u).on(ref), + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(1e-15) + .on(ref)) + .on(ref); + } + + void TearDown() + { + if (hip != nullptr) { + ASSERT_NO_THROW(hip->synchronize()); + } + } + + std::unique_ptr gen_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(0.0, 1.0), rand_engine, ref); + } + + void initialize_data() + { + int m = 597; + int n = 17; + x = gen_mtx(m, n); + b = gen_mtx(m, n); + r = gen_mtx(m, n); + z = gen_mtx(m, n); + p = gen_mtx(m, n); + rr = gen_mtx(m, n); + s = gen_mtx(m, n); + t = gen_mtx(m, n); + y = gen_mtx(m, n); + v = gen_mtx(m, n); + prev_rho = gen_mtx(1, n); + rho = gen_mtx(1, n); + alpha = gen_mtx(1, n); + beta = gen_mtx(1, n); + gamma = gen_mtx(1, n); + omega = gen_mtx(1, n); + stop_status = std::unique_ptr>( + new 
gko::Array(ref, n)); + for (size_t i = 0; i < n; ++i) { + stop_status->get_data()[i].reset(); + } + + d_x = Mtx::create(hip); + d_b = Mtx::create(hip); + d_r = Mtx::create(hip); + d_z = Mtx::create(hip); + d_p = Mtx::create(hip); + d_t = Mtx::create(hip); + d_s = Mtx::create(hip); + d_y = Mtx::create(hip); + d_v = Mtx::create(hip); + d_rr = Mtx::create(hip); + d_prev_rho = Mtx::create(hip); + d_rho = Mtx::create(hip); + d_alpha = Mtx::create(hip); + d_beta = Mtx::create(hip); + d_gamma = Mtx::create(hip); + d_omega = Mtx::create(hip); + d_stop_status = std::unique_ptr>( + new gko::Array(hip)); + + d_x->copy_from(x.get()); + d_b->copy_from(b.get()); + d_r->copy_from(r.get()); + d_z->copy_from(z.get()); + d_p->copy_from(p.get()); + d_v->copy_from(v.get()); + d_y->copy_from(y.get()); + d_t->copy_from(t.get()); + d_s->copy_from(s.get()); + d_rr->copy_from(rr.get()); + d_prev_rho->copy_from(prev_rho.get()); + d_rho->copy_from(rho.get()); + d_alpha->copy_from(alpha.get()); + d_beta->copy_from(beta.get()); + d_gamma->copy_from(gamma.get()); + d_omega->copy_from(omega.get()); + *d_stop_status = + *stop_status; // copy_from is not a public member function of Array + } + + void make_diag_dominant(Mtx *mtx) + { + using std::abs; + for (int i = 0; i < mtx->get_size()[0]; ++i) { + auto sum = gko::zero(); + for (int j = 0; j < mtx->get_size()[1]; ++j) { + sum += abs(mtx->at(i, j)); + } + mtx->at(i, i) = sum; + } + } + + std::shared_ptr ref; + std::shared_ptr hip; + + std::ranlux48 rand_engine; + + std::shared_ptr mtx; + std::shared_ptr d_mtx; + std::unique_ptr hip_bicgstab_factory; + std::unique_ptr ref_bicgstab_factory; + + std::unique_ptr x; + std::unique_ptr b; + std::unique_ptr r; + std::unique_ptr z; + std::unique_ptr p; + std::unique_ptr rr; + std::unique_ptr s; + std::unique_ptr t; + std::unique_ptr y; + std::unique_ptr v; + std::unique_ptr prev_rho; + std::unique_ptr rho; + std::unique_ptr alpha; + std::unique_ptr beta; + std::unique_ptr gamma; + std::unique_ptr omega; + 
std::unique_ptr> stop_status; + + std::unique_ptr d_x; + std::unique_ptr d_b; + std::unique_ptr d_r; + std::unique_ptr d_z; + std::unique_ptr d_p; + std::unique_ptr d_t; + std::unique_ptr d_s; + std::unique_ptr d_y; + std::unique_ptr d_v; + std::unique_ptr d_rr; + std::unique_ptr d_prev_rho; + std::unique_ptr d_rho; + std::unique_ptr d_alpha; + std::unique_ptr d_beta; + std::unique_ptr d_gamma; + std::unique_ptr d_omega; + std::unique_ptr> d_stop_status; +}; + + +TEST_F(Bicgstab, HipBicgstabInitializeIsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::bicgstab::initialize( + ref, b.get(), r.get(), rr.get(), y.get(), s.get(), t.get(), z.get(), + v.get(), p.get(), prev_rho.get(), rho.get(), alpha.get(), beta.get(), + gamma.get(), omega.get(), stop_status.get()); + gko::kernels::hip::bicgstab::initialize( + hip, d_b.get(), d_r.get(), d_rr.get(), d_y.get(), d_s.get(), d_t.get(), + d_z.get(), d_v.get(), d_p.get(), d_prev_rho.get(), d_rho.get(), + d_alpha.get(), d_beta.get(), d_gamma.get(), d_omega.get(), + d_stop_status.get()); + + GKO_EXPECT_MTX_NEAR(d_r, r, 1e-14); + GKO_EXPECT_MTX_NEAR(d_z, z, 1e-14); + GKO_EXPECT_MTX_NEAR(d_p, p, 1e-14); + GKO_EXPECT_MTX_NEAR(d_y, y, 1e-14); + GKO_EXPECT_MTX_NEAR(d_t, t, 1e-14); + GKO_EXPECT_MTX_NEAR(d_s, s, 1e-14); + GKO_EXPECT_MTX_NEAR(d_rr, rr, 1e-14); + GKO_EXPECT_MTX_NEAR(d_v, v, 1e-14); + GKO_EXPECT_MTX_NEAR(d_prev_rho, prev_rho, 1e-14); + GKO_EXPECT_MTX_NEAR(d_rho, rho, 1e-14); + GKO_EXPECT_MTX_NEAR(d_alpha, alpha, 1e-14); + GKO_EXPECT_MTX_NEAR(d_beta, beta, 1e-14); + GKO_EXPECT_MTX_NEAR(d_gamma, gamma, 1e-14); + GKO_EXPECT_MTX_NEAR(d_omega, omega, 1e-14); + GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status); +} + + +TEST_F(Bicgstab, HipBicgstabStep1IsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::bicgstab::step_1( + ref, r.get(), p.get(), v.get(), rho.get(), prev_rho.get(), alpha.get(), + omega.get(), stop_status.get()); + gko::kernels::hip::bicgstab::step_1( + hip, d_r.get(), 
d_p.get(), d_v.get(), d_rho.get(), d_prev_rho.get(), + d_alpha.get(), d_omega.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14); +} + + +TEST_F(Bicgstab, HipBicgstabStep2IsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::bicgstab::step_2(ref, r.get(), s.get(), v.get(), + rho.get(), alpha.get(), + beta.get(), stop_status.get()); + gko::kernels::hip::bicgstab::step_2(hip, d_r.get(), d_s.get(), d_v.get(), + d_rho.get(), d_alpha.get(), + d_beta.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_alpha, alpha, 1e-14); + GKO_ASSERT_MTX_NEAR(d_s, s, 1e-14); +} + + +TEST_F(Bicgstab, HipBicgstabStep3IsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::bicgstab::step_3( + ref, x.get(), r.get(), s.get(), t.get(), y.get(), z.get(), alpha.get(), + beta.get(), gamma.get(), omega.get(), stop_status.get()); + gko::kernels::hip::bicgstab::step_3( + hip, d_x.get(), d_r.get(), d_s.get(), d_t.get(), d_y.get(), d_z.get(), + d_alpha.get(), d_beta.get(), d_gamma.get(), d_omega.get(), + d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_omega, omega, 1e-14); + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); + GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14); +} + + +TEST_F(Bicgstab, HipBicgstabApplyOneRHSIsEquivalentToRef) +{ + int m = 123; + int n = 1; + auto ref_solver = ref_bicgstab_factory->generate(mtx); + auto hip_solver = hip_bicgstab_factory->generate(d_mtx); + auto b = gen_mtx(m, n); + auto x = gen_mtx(m, n); + auto d_b = Mtx::create(hip); + auto d_x = Mtx::create(hip); + d_b->copy_from(b.get()); + d_x->copy_from(x.get()); + + ref_solver->apply(b.get(), x.get()); + hip_solver->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_b, b, 1e-13); + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-13); +} + + +TEST_F(Bicgstab, HipBicgstabApplyMultipleRHSIsEquivalentToRef) +{ + int m = 123; + int n = 16; + auto hip_solver = hip_bicgstab_factory->generate(d_mtx); + auto ref_solver = ref_bicgstab_factory->generate(mtx); + auto b = gen_mtx(m, n); + auto x = gen_mtx(m, n); 
+ auto d_b = Mtx::create(hip); + auto d_x = Mtx::create(hip); + d_b->copy_from(b.get()); + d_x->copy_from(x.get()); + + ref_solver->apply(b.get(), x.get()); + hip_solver->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_b, b, 1e-13); + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-13); +} + + +} // namespace diff --git a/hip/test/solver/cg_kernels.cpp b/hip/test/solver/cg_kernels.cpp new file mode 100644 index 00000000000..db472f22000 --- /dev/null +++ b/hip/test/solver/cg_kernels.cpp @@ -0,0 +1,272 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include +#include +#include + + +#include "core/solver/cg_kernels.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class Cg : public ::testing::Test { +protected: + using Mtx = gko::matrix::Dense<>; + Cg() : rand_engine(30) {} + + void SetUp() + { + ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); + ref = gko::ReferenceExecutor::create(); + hip = gko::HipExecutor::create(0, ref); + } + + void TearDown() + { + if (hip != nullptr) { + ASSERT_NO_THROW(hip->synchronize()); + } + } + + std::unique_ptr gen_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + } + + void initialize_data() + { + int m = 597; + int n = 43; + b = gen_mtx(m, n); + r = gen_mtx(m, n); + z = gen_mtx(m, n); + p = gen_mtx(m, n); + q = gen_mtx(m, n); + x = gen_mtx(m, n); + beta = gen_mtx(1, n); + prev_rho = gen_mtx(1, n); + rho = gen_mtx(1, n); + stop_status = std::unique_ptr>( + new gko::Array(ref, n)); + for (size_t i = 0; i < stop_status->get_num_elems(); ++i) { + stop_status->get_data()[i].reset(); + } + + d_b = Mtx::create(hip); + d_b->copy_from(b.get()); + d_r = Mtx::create(hip); + d_r->copy_from(r.get()); + d_z = Mtx::create(hip); + d_z->copy_from(z.get()); + d_p = 
Mtx::create(hip); + d_p->copy_from(p.get()); + d_q = Mtx::create(hip); + d_q->copy_from(q.get()); + d_x = Mtx::create(hip); + d_x->copy_from(x.get()); + d_beta = Mtx::create(hip); + d_beta->copy_from(beta.get()); + d_prev_rho = Mtx::create(hip); + d_prev_rho->copy_from(prev_rho.get()); + d_rho = Mtx::create(hip); + d_rho->copy_from(rho.get()); + d_stop_status = std::unique_ptr>( + new gko::Array(hip, n)); + *d_stop_status = *stop_status; + } + + void make_symetric(Mtx *mtx) + { + for (int i = 0; i < mtx->get_size()[0]; ++i) { + for (int j = i + 1; j < mtx->get_size()[1]; ++j) { + mtx->at(i, j) = mtx->at(j, i); + } + } + } + + void make_diag_dominant(Mtx *mtx) + { + using std::abs; + for (int i = 0; i < mtx->get_size()[0]; ++i) { + auto sum = gko::zero(); + for (int j = 0; j < mtx->get_size()[1]; ++j) { + sum += abs(mtx->at(i, j)); + } + mtx->at(i, i) = sum; + } + } + + void make_spd(Mtx *mtx) + { + make_symetric(mtx); + make_diag_dominant(mtx); + } + + std::shared_ptr ref; + std::shared_ptr hip; + + std::ranlux48 rand_engine; + + std::unique_ptr b; + std::unique_ptr r; + std::unique_ptr z; + std::unique_ptr p; + std::unique_ptr q; + std::unique_ptr x; + std::unique_ptr beta; + std::unique_ptr prev_rho; + std::unique_ptr rho; + std::unique_ptr> stop_status; + + std::unique_ptr d_b; + std::unique_ptr d_r; + std::unique_ptr d_z; + std::unique_ptr d_p; + std::unique_ptr d_q; + std::unique_ptr d_x; + std::unique_ptr d_beta; + std::unique_ptr d_prev_rho; + std::unique_ptr d_rho; + std::unique_ptr> d_stop_status; +}; + + +TEST_F(Cg, HipCgInitializeIsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::cg::initialize(ref, b.get(), r.get(), z.get(), + p.get(), q.get(), prev_rho.get(), + rho.get(), stop_status.get()); + gko::kernels::hip::cg::initialize(hip, d_b.get(), d_r.get(), d_z.get(), + d_p.get(), d_q.get(), d_prev_rho.get(), + d_rho.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14); + GKO_ASSERT_MTX_NEAR(d_z, z, 1e-14); + 
GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14); + GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14); + GKO_ASSERT_MTX_NEAR(d_prev_rho, prev_rho, 1e-14); + GKO_ASSERT_MTX_NEAR(d_rho, rho, 1e-14); + GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status); +} + + +TEST_F(Cg, HipCgStep1IsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::cg::step_1(ref, p.get(), z.get(), rho.get(), + prev_rho.get(), stop_status.get()); + gko::kernels::hip::cg::step_1(hip, d_p.get(), d_z.get(), d_rho.get(), + d_prev_rho.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14); + GKO_ASSERT_MTX_NEAR(d_z, z, 1e-14); +} + + +TEST_F(Cg, HipCgStep2IsEquivalentToRef) +{ + initialize_data(); + gko::kernels::reference::cg::step_2(ref, x.get(), r.get(), p.get(), q.get(), + beta.get(), rho.get(), + stop_status.get()); + gko::kernels::hip::cg::step_2(hip, d_x.get(), d_r.get(), d_p.get(), + d_q.get(), d_beta.get(), d_rho.get(), + d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); + GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14); + GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14); + GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14); +} + + +TEST_F(Cg, ApplyIsEquivalentToRef) +{ + auto mtx = gen_mtx(50, 50); + make_spd(mtx.get()); + auto x = gen_mtx(50, 3); + auto b = gen_mtx(50, 3); + auto d_mtx = Mtx::create(hip); + d_mtx->copy_from(mtx.get()); + auto d_x = Mtx::create(hip); + d_x->copy_from(x.get()); + auto d_b = Mtx::create(hip); + d_b->copy_from(b.get()); + auto cg_factory = + gko::solver::Cg<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(50u).on(ref), + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(1e-14) + .on(ref)) + .on(ref); + auto d_cg_factory = + gko::solver::Cg<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(50u).on(hip), + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(1e-14) + .on(hip)) + .on(hip); + auto solver = cg_factory->generate(std::move(mtx)); + auto d_solver = d_cg_factory->generate(std::move(d_mtx)); + + 
solver->apply(b.get(), x.get()); + d_solver->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); +} + + +} // namespace diff --git a/hip/test/solver/cgs_kernels.cpp b/hip/test/solver/cgs_kernels.cpp new file mode 100644 index 00000000000..ff676c2dffc --- /dev/null +++ b/hip/test/solver/cgs_kernels.cpp @@ -0,0 +1,349 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include +#include +#include + + +#include "core/solver/cgs_kernels.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class Cgs : public ::testing::Test { +protected: + using Mtx = gko::matrix::Dense<>; + using Solver = gko::solver::Cgs<>; + + Cgs() : rand_engine(30) {} + + void SetUp() + { + ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); + ref = gko::ReferenceExecutor::create(); + hip = gko::HipExecutor::create(0, ref); + + mtx = gen_mtx(123, 123); + make_diag_dominant(mtx.get()); + d_mtx = Mtx::create(hip); + d_mtx->copy_from(mtx.get()); + hip_cgs_factory = + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(246u).on(hip), + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(1e-15) + .on(hip)) + .on(hip); + ref_cgs_factory = + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(246u).on(ref), + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(1e-15) + .on(ref)) + .on(ref); + } + + void TearDown() + { + if (hip != nullptr) { + ASSERT_NO_THROW(hip->synchronize()); + } + } + + std::unique_ptr gen_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(0.0, 1.0), rand_engine, ref); + } + + void initialize_data() + { + int m = 597; + int n = 43; + b = gen_mtx(m, n); + r = gen_mtx(m, n); + r_tld = gen_mtx(m, n); + p = gen_mtx(m, n); + q = gen_mtx(m, n); + u = gen_mtx(m, n); + u_hat = gen_mtx(m, n); + v_hat = gen_mtx(m, n); + t = gen_mtx(m, n); + x = gen_mtx(m, n); + alpha = gen_mtx(1, n); + beta = gen_mtx(1, n); + gamma = gen_mtx(1, n); + rho = gen_mtx(1, n); + rho_prev = gen_mtx(1, n); + stop_status = std::unique_ptr>( + new gko::Array(ref, n)); + for (size_t i = 0; i < 
stop_status->get_num_elems(); ++i) { + stop_status->get_data()[i].reset(); + } + + d_b = Mtx::create(hip); + d_b->copy_from(b.get()); + d_r = Mtx::create(hip); + d_r->copy_from(r.get()); + d_r_tld = Mtx::create(hip); + d_r_tld->copy_from(r_tld.get()); + d_p = Mtx::create(hip); + d_p->copy_from(p.get()); + d_q = Mtx::create(hip); + d_q->copy_from(q.get()); + d_u = Mtx::create(hip); + d_u->copy_from(u.get()); + d_u_hat = Mtx::create(hip); + d_u_hat->copy_from(u_hat.get()); + d_v_hat = Mtx::create(hip); + d_v_hat->copy_from(v_hat.get()); + d_t = Mtx::create(hip); + d_t->copy_from(t.get()); + d_x = Mtx::create(hip); + d_x->copy_from(x.get()); + d_alpha = Mtx::create(hip); + d_alpha->copy_from(alpha.get()); + d_beta = Mtx::create(hip); + d_beta->copy_from(beta.get()); + d_gamma = Mtx::create(hip); + d_gamma->copy_from(gamma.get()); + d_rho_prev = Mtx::create(hip); + d_rho_prev->copy_from(rho_prev.get()); + d_rho = Mtx::create(hip); + d_rho->copy_from(rho.get()); + d_stop_status = std::unique_ptr>( + new gko::Array(hip, n)); + // because there is no public function copy_from, use overloaded = + // operator + *d_stop_status = *stop_status; + } + + void make_diag_dominant(Mtx *mtx) + { + using std::abs; + for (int i = 0; i < mtx->get_size()[0]; ++i) { + auto sum = gko::zero(); + for (int j = 0; j < mtx->get_size()[1]; ++j) { + sum += abs(mtx->at(i, j)); + } + mtx->at(i, i) = sum; + } + } + + std::shared_ptr ref; + std::shared_ptr hip; + + std::ranlux48 rand_engine; + + std::shared_ptr mtx; + std::shared_ptr d_mtx; + std::unique_ptr hip_cgs_factory; + std::unique_ptr ref_cgs_factory; + + std::unique_ptr b; + std::unique_ptr r; + std::unique_ptr r_tld; + std::unique_ptr t; + std::unique_ptr p; + std::unique_ptr q; + std::unique_ptr u; + std::unique_ptr u_hat; + std::unique_ptr v_hat; + std::unique_ptr x; + std::unique_ptr alpha; + std::unique_ptr beta; + std::unique_ptr gamma; + std::unique_ptr rho; + std::unique_ptr rho_prev; + std::unique_ptr> stop_status; + + 
std::unique_ptr d_b; + std::unique_ptr d_r; + std::unique_ptr d_r_tld; + std::unique_ptr d_t; + std::unique_ptr d_p; + std::unique_ptr d_q; + std::unique_ptr d_u; + std::unique_ptr d_u_hat; + std::unique_ptr d_v_hat; + std::unique_ptr d_x; + std::unique_ptr d_alpha; + std::unique_ptr d_beta; + std::unique_ptr d_gamma; + std::unique_ptr d_rho; + std::unique_ptr d_rho_prev; + std::unique_ptr> d_stop_status; +}; + + +TEST_F(Cgs, HipCgsInitializeIsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::cgs::initialize( + ref, b.get(), r.get(), r_tld.get(), p.get(), q.get(), u.get(), + u_hat.get(), v_hat.get(), t.get(), alpha.get(), beta.get(), gamma.get(), + rho_prev.get(), rho.get(), stop_status.get()); + gko::kernels::hip::cgs::initialize( + hip, d_b.get(), d_r.get(), d_r_tld.get(), d_p.get(), d_q.get(), + d_u.get(), d_u_hat.get(), d_v_hat.get(), d_t.get(), d_alpha.get(), + d_beta.get(), d_gamma.get(), d_rho_prev.get(), d_rho.get(), + d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14); + GKO_ASSERT_MTX_NEAR(d_r_tld, r_tld, 1e-14); + GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14); + GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14); + GKO_ASSERT_MTX_NEAR(d_u, u, 1e-14); + GKO_ASSERT_MTX_NEAR(d_t, t, 1e-14); + GKO_ASSERT_MTX_NEAR(d_u_hat, u_hat, 1e-14); + GKO_ASSERT_MTX_NEAR(d_v_hat, v_hat, 1e-14); + GKO_ASSERT_MTX_NEAR(d_rho_prev, rho_prev, 1e-14); + GKO_ASSERT_MTX_NEAR(d_rho, rho, 1e-14); + GKO_ASSERT_MTX_NEAR(d_alpha, alpha, 1e-14); + GKO_ASSERT_MTX_NEAR(d_beta, beta, 1e-14); + GKO_ASSERT_MTX_NEAR(d_gamma, gamma, 1e-14); + GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status); +} + + +TEST_F(Cgs, HipCgsStep1IsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::cgs::step_1(ref, r.get(), u.get(), p.get(), + q.get(), beta.get(), rho.get(), + rho_prev.get(), stop_status.get()); + gko::kernels::hip::cgs::step_1(hip, d_r.get(), d_u.get(), d_p.get(), + d_q.get(), d_beta.get(), d_rho.get(), + d_rho_prev.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_beta, 
beta, 1e-14); + GKO_ASSERT_MTX_NEAR(d_u, u, 1e-14); + GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14); +} + + +TEST_F(Cgs, HipCgsStep2IsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::cgs::step_2(ref, u.get(), v_hat.get(), q.get(), + t.get(), alpha.get(), rho.get(), + gamma.get(), stop_status.get()); + gko::kernels::hip::cgs::step_2(hip, d_u.get(), d_v_hat.get(), d_q.get(), + d_t.get(), d_alpha.get(), d_rho.get(), + d_gamma.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_alpha, alpha, 1e-14); + GKO_ASSERT_MTX_NEAR(d_t, t, 1e-14); + GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14); +} + + +TEST_F(Cgs, HipCgsStep3IsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::cgs::step_3(ref, t.get(), u_hat.get(), r.get(), + x.get(), alpha.get(), + stop_status.get()); + gko::kernels::hip::cgs::step_3(hip, d_t.get(), d_u_hat.get(), d_r.get(), + d_x.get(), d_alpha.get(), + d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); + GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14); +} + + +TEST_F(Cgs, HipCgsApplyOneRHSIsEquivalentToRef) +{ + int m = 123; + int n = 1; + auto ref_solver = ref_cgs_factory->generate(mtx); + auto hip_solver = hip_cgs_factory->generate(d_mtx); + auto b = gen_mtx(m, n); + auto x = gen_mtx(m, n); + auto d_b = Mtx::create(hip); + auto d_x = Mtx::create(hip); + d_b->copy_from(b.get()); + d_x->copy_from(x.get()); + + ref_solver->apply(b.get(), x.get()); + hip_solver->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_b, b, 1e-13); + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-13); +} + + +TEST_F(Cgs, HipCgsApplyMultipleRHSIsEquivalentToRef) +{ + int m = 123; + int n = 16; + auto hip_solver = hip_cgs_factory->generate(d_mtx); + auto ref_solver = ref_cgs_factory->generate(mtx); + auto b = gen_mtx(m, n); + auto x = gen_mtx(m, n); + auto d_b = Mtx::create(hip); + auto d_x = Mtx::create(hip); + d_b->copy_from(b.get()); + d_x->copy_from(x.get()); + + ref_solver->apply(b.get(), x.get()); + hip_solver->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_b, b, 
1e-13); + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-13); +} + +} // namespace diff --git a/hip/test/solver/fcg_kernels.cpp b/hip/test/solver/fcg_kernels.cpp new file mode 100644 index 00000000000..7771cf9b03c --- /dev/null +++ b/hip/test/solver/fcg_kernels.cpp @@ -0,0 +1,285 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include +#include +#include + + +#include "core/solver/fcg_kernels.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class Fcg : public ::testing::Test { +protected: + using Mtx = gko::matrix::Dense<>; + using Solver = gko::solver::Fcg<>; + + Fcg() : rand_engine(30) {} + + void SetUp() + { + ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); + ref = gko::ReferenceExecutor::create(); + hip = gko::HipExecutor::create(0, ref); + } + + void TearDown() + { + if (hip != nullptr) { + ASSERT_NO_THROW(hip->synchronize()); + } + } + + std::unique_ptr gen_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(0.0, 1.0), rand_engine, ref); + } + + void initialize_data() + { + int m = 597; + int n = 43; + b = gen_mtx(m, n); + r = gen_mtx(m, n); + t = gen_mtx(m, n); + z = gen_mtx(m, n); + p = gen_mtx(m, n); + q = gen_mtx(m, n); + x = gen_mtx(m, n); + beta = gen_mtx(1, n); + prev_rho = gen_mtx(1, n); + rho = gen_mtx(1, n); + rho_t = gen_mtx(1, n); + stop_status = std::unique_ptr>( + new gko::Array(ref, n)); + for (size_t i = 0; i < stop_status->get_num_elems(); ++i) { + stop_status->get_data()[i].reset(); + } + + d_b = Mtx::create(hip); + d_b->copy_from(b.get()); + d_r = Mtx::create(hip); + d_r->copy_from(r.get()); + d_t = Mtx::create(hip); + d_t->copy_from(t.get()); + d_z = Mtx::create(hip); + d_z->copy_from(z.get()); + d_p = Mtx::create(hip); + d_p->copy_from(p.get()); + d_q = Mtx::create(hip); + d_q->copy_from(q.get()); + d_x = Mtx::create(hip); + d_x->copy_from(x.get()); + d_beta = Mtx::create(hip); + d_beta->copy_from(beta.get()); + d_prev_rho = Mtx::create(hip); + d_prev_rho->copy_from(prev_rho.get()); + d_rho_t = Mtx::create(hip); + d_rho_t->copy_from(rho_t.get()); + d_rho = Mtx::create(hip); + 
d_rho->copy_from(rho.get()); + d_stop_status = std::unique_ptr>( + new gko::Array(hip, n)); + *d_stop_status = *stop_status; + } + + void make_symetric(Mtx *mtx) + { + for (int i = 0; i < mtx->get_size()[0]; ++i) { + for (int j = i + 1; j < mtx->get_size()[1]; ++j) { + mtx->at(i, j) = mtx->at(j, i); + } + } + } + + void make_diag_dominant(Mtx *mtx) + { + using std::abs; + for (int i = 0; i < mtx->get_size()[0]; ++i) { + auto sum = gko::zero(); + for (int j = 0; j < mtx->get_size()[1]; ++j) { + sum += abs(mtx->at(i, j)); + } + mtx->at(i, i) = sum; + } + } + + void make_spd(Mtx *mtx) + { + make_symetric(mtx); + make_diag_dominant(mtx); + } + + std::shared_ptr ref; + std::shared_ptr hip; + + std::ranlux48 rand_engine; + + std::unique_ptr b; + std::unique_ptr r; + std::unique_ptr t; + std::unique_ptr z; + std::unique_ptr p; + std::unique_ptr q; + std::unique_ptr x; + std::unique_ptr beta; + std::unique_ptr prev_rho; + std::unique_ptr rho; + std::unique_ptr rho_t; + std::unique_ptr> stop_status; + + std::unique_ptr d_b; + std::unique_ptr d_r; + std::unique_ptr d_t; + std::unique_ptr d_z; + std::unique_ptr d_p; + std::unique_ptr d_q; + std::unique_ptr d_x; + std::unique_ptr d_beta; + std::unique_ptr d_prev_rho; + std::unique_ptr d_rho; + std::unique_ptr d_rho_t; + std::unique_ptr> d_stop_status; +}; + + +TEST_F(Fcg, HipFcgInitializeIsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::fcg::initialize( + ref, b.get(), r.get(), z.get(), p.get(), q.get(), t.get(), + prev_rho.get(), rho.get(), rho_t.get(), stop_status.get()); + gko::kernels::hip::fcg::initialize( + hip, d_b.get(), d_r.get(), d_z.get(), d_p.get(), d_q.get(), d_t.get(), + d_prev_rho.get(), d_rho.get(), d_rho_t.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14); + GKO_ASSERT_MTX_NEAR(d_t, t, 1e-14); + GKO_ASSERT_MTX_NEAR(d_z, z, 1e-14); + GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14); + GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14); + GKO_ASSERT_MTX_NEAR(d_prev_rho, prev_rho, 1e-14); + 
GKO_ASSERT_MTX_NEAR(d_rho, rho, 1e-14); + GKO_ASSERT_MTX_NEAR(d_rho_t, rho_t, 1e-14); + GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status); +} + + +TEST_F(Fcg, HipFcgStep1IsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::fcg::step_1(ref, p.get(), z.get(), rho_t.get(), + prev_rho.get(), stop_status.get()); + gko::kernels::hip::fcg::step_1(hip, d_p.get(), d_z.get(), d_rho_t.get(), + d_prev_rho.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14); + GKO_ASSERT_MTX_NEAR(d_z, z, 1e-14); +} + + +TEST_F(Fcg, HipFcgStep2IsEquivalentToRef) +{ + initialize_data(); + gko::kernels::reference::fcg::step_2(ref, x.get(), r.get(), t.get(), + p.get(), q.get(), beta.get(), + rho.get(), stop_status.get()); + gko::kernels::hip::fcg::step_2(hip, d_x.get(), d_r.get(), d_t.get(), + d_p.get(), d_q.get(), d_beta.get(), + d_rho.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); + GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14); + GKO_ASSERT_MTX_NEAR(d_t, t, 1e-14); +} + + +TEST_F(Fcg, ApplyIsEquivalentToRef) +{ + auto mtx = gen_mtx(50, 50); + make_spd(mtx.get()); + auto x = gen_mtx(50, 3); + auto b = gen_mtx(50, 3); + auto d_mtx = Mtx::create(hip); + d_mtx->copy_from(mtx.get()); + auto d_x = Mtx::create(hip); + d_x->copy_from(x.get()); + auto d_b = Mtx::create(hip); + d_b->copy_from(b.get()); + auto fcg_factory = + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(50u).on(ref), + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(1e-14) + .on(ref)) + .on(ref); + auto d_fcg_factory = + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(50u).on(hip), + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(1e-14) + .on(hip)) + .on(hip); + auto solver = fcg_factory->generate(std::move(mtx)); + auto d_solver = d_fcg_factory->generate(std::move(d_mtx)); + + solver->apply(b.get(), x.get()); + d_solver->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 
1e-14); +} + + +} // namespace diff --git a/hip/test/solver/gmres_kernels.cpp b/hip/test/solver/gmres_kernels.cpp new file mode 100644 index 00000000000..d16c781cb1e --- /dev/null +++ b/hip/test/solver/gmres_kernels.cpp @@ -0,0 +1,300 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include +#include +#include +#include + + +#include "core/solver/gmres_kernels.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class Gmres : public ::testing::Test { +protected: + using value_type = gko::default_precision; + using index_type = gko::int32; + using Mtx = gko::matrix::Dense; + using norm_type = gko::remove_complex; + using NormVector = gko::matrix::Dense; + template + using Dense = typename gko::matrix::Dense; + + Gmres() : rand_engine(30) {} + + void SetUp() + { + ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); + ref = gko::ReferenceExecutor::create(); + hip = gko::HipExecutor::create(0, ref); + } + + void TearDown() + { + if (hip != nullptr) { + ASSERT_NO_THROW(hip->synchronize()); + } + } + + template + std::unique_ptr> gen_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_matrix>( + num_rows, num_cols, + std::uniform_int_distribution(num_cols, num_cols), + std::normal_distribution(-1.0, 1.0), rand_engine, ref); + } + + + void initialize_data(int nrhs = 43) + { + int m = 597; + x = gen_mtx(m, nrhs); + y = gen_mtx(gko::solver::default_krylov_dim, nrhs); + before_preconditioner = Mtx::create_with_config_of(x.get()); + b = gen_mtx(m, nrhs); + krylov_bases = gen_mtx(m * (gko::solver::default_krylov_dim + 1), nrhs); + hessenberg = gen_mtx(gko::solver::default_krylov_dim + 1, + gko::solver::default_krylov_dim * nrhs); + hessenberg_iter = gen_mtx(gko::solver::default_krylov_dim + 1, nrhs); + residual = gen_mtx(m, nrhs); + residual_norm = gen_mtx(1, nrhs); + residual_norm_collection = + gen_mtx(gko::solver::default_krylov_dim + 1, nrhs); + givens_sin = gen_mtx(gko::solver::default_krylov_dim, nrhs); + givens_cos = gen_mtx(gko::solver::default_krylov_dim, nrhs); + stop_status = std::unique_ptr>( + new gko::Array(ref, nrhs)); + for (size_t i = 0; i < stop_status->get_num_elems(); 
++i) { + stop_status->get_data()[i].reset(); + } + final_iter_nums = std::unique_ptr>( + new gko::Array(ref, nrhs)); + for (size_t i = 0; i < final_iter_nums->get_num_elems(); ++i) { + final_iter_nums->get_data()[i] = 5; + } + + d_x = Mtx::create(hip); + d_x->copy_from(x.get()); + d_before_preconditioner = Mtx::create_with_config_of(d_x.get()); + d_y = Mtx::create(hip); + d_y->copy_from(y.get()); + d_b = Mtx::create(hip); + d_b->copy_from(b.get()); + d_krylov_bases = Mtx::create(hip); + d_krylov_bases->copy_from(krylov_bases.get()); + d_hessenberg = Mtx::create(hip); + d_hessenberg->copy_from(hessenberg.get()); + d_hessenberg_iter = Mtx::create(hip); + d_hessenberg_iter->copy_from(hessenberg_iter.get()); + d_residual = Mtx::create(hip); + d_residual->copy_from(residual.get()); + d_residual_norm = NormVector::create(hip); + d_residual_norm->copy_from(residual_norm.get()); + d_residual_norm_collection = Mtx::create(hip); + d_residual_norm_collection->copy_from(residual_norm_collection.get()); + d_givens_sin = Mtx::create(hip); + d_givens_sin->copy_from(givens_sin.get()); + d_givens_cos = Mtx::create(hip); + d_givens_cos->copy_from(givens_cos.get()); + d_stop_status = std::unique_ptr>( + new gko::Array(hip, nrhs)); + *d_stop_status = *stop_status; + d_final_iter_nums = std::unique_ptr>( + new gko::Array(hip, nrhs)); + *d_final_iter_nums = *final_iter_nums; + } + + std::shared_ptr ref; + std::shared_ptr hip; + + std::ranlux48 rand_engine; + + std::unique_ptr before_preconditioner; + std::unique_ptr x; + std::unique_ptr y; + std::unique_ptr b; + std::unique_ptr krylov_bases; + std::unique_ptr hessenberg; + std::unique_ptr hessenberg_iter; + std::unique_ptr residual; + std::unique_ptr residual_norm; + std::unique_ptr residual_norm_collection; + std::unique_ptr givens_sin; + std::unique_ptr givens_cos; + std::unique_ptr> stop_status; + std::unique_ptr> final_iter_nums; + + std::unique_ptr d_x; + std::unique_ptr d_before_preconditioner; + std::unique_ptr d_y; + 
std::unique_ptr d_b; + std::unique_ptr d_krylov_bases; + std::unique_ptr d_hessenberg; + std::unique_ptr d_hessenberg_iter; + std::unique_ptr d_residual; + std::unique_ptr d_residual_norm; + std::unique_ptr d_residual_norm_collection; + std::unique_ptr d_givens_sin; + std::unique_ptr d_givens_cos; + std::unique_ptr> d_stop_status; + std::unique_ptr> d_final_iter_nums; +}; + + +TEST_F(Gmres, HipGmresInitialize1IsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::gmres::initialize_1( + ref, b.get(), residual.get(), givens_sin.get(), givens_cos.get(), + stop_status.get(), gko::solver::default_krylov_dim); + gko::kernels::hip::gmres::initialize_1( + hip, d_b.get(), d_residual.get(), d_givens_sin.get(), + d_givens_cos.get(), d_stop_status.get(), + gko::solver::default_krylov_dim); + + GKO_ASSERT_MTX_NEAR(d_residual, residual, 1e-14); + GKO_ASSERT_MTX_NEAR(d_givens_sin, givens_sin, 1e-14); + GKO_ASSERT_MTX_NEAR(d_givens_cos, givens_cos, 1e-14); + GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status); +} + + +TEST_F(Gmres, HipGmresInitialize2IsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::gmres::initialize_2( + ref, residual.get(), residual_norm.get(), + residual_norm_collection.get(), krylov_bases.get(), + final_iter_nums.get(), gko::solver::default_krylov_dim); + gko::kernels::hip::gmres::initialize_2( + hip, d_residual.get(), d_residual_norm.get(), + d_residual_norm_collection.get(), d_krylov_bases.get(), + d_final_iter_nums.get(), gko::solver::default_krylov_dim); + + GKO_ASSERT_MTX_NEAR(d_residual_norm, residual_norm, 1e-14); + GKO_ASSERT_MTX_NEAR(d_residual_norm_collection, residual_norm_collection, + 1e-14); + GKO_ASSERT_MTX_NEAR(d_krylov_bases, krylov_bases, 1e-14); + GKO_ASSERT_ARRAY_EQ(*d_final_iter_nums, *final_iter_nums); +} + + +TEST_F(Gmres, HipGmresStep1IsEquivalentToRef) +{ + initialize_data(); + int iter = 5; + + gko::kernels::reference::gmres::step_1( + ref, x->get_size()[0], givens_sin.get(), givens_cos.get(), + 
residual_norm.get(), residual_norm_collection.get(), krylov_bases.get(), + hessenberg_iter.get(), iter, final_iter_nums.get(), stop_status.get()); + gko::kernels::hip::gmres::step_1( + hip, d_x->get_size()[0], d_givens_sin.get(), d_givens_cos.get(), + d_residual_norm.get(), d_residual_norm_collection.get(), + d_krylov_bases.get(), d_hessenberg_iter.get(), iter, + d_final_iter_nums.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_givens_sin, givens_sin, 1e-14); + GKO_ASSERT_MTX_NEAR(d_givens_cos, givens_cos, 1e-14); + GKO_ASSERT_MTX_NEAR(d_residual_norm, residual_norm, 1e-14); + GKO_ASSERT_MTX_NEAR(d_residual_norm_collection, residual_norm_collection, + 1e-14); + GKO_ASSERT_MTX_NEAR(d_hessenberg_iter, hessenberg_iter, 1e-14); + GKO_ASSERT_MTX_NEAR(d_krylov_bases, krylov_bases, 1e-14); + GKO_ASSERT_ARRAY_EQ(*d_final_iter_nums, *final_iter_nums); +} + + +TEST_F(Gmres, HipGmresStep1OnSingleRHSIsEquivalentToRef) +{ + initialize_data(1); + int iter = 5; + + gko::kernels::reference::gmres::step_1( + ref, x->get_size()[0], givens_sin.get(), givens_cos.get(), + residual_norm.get(), residual_norm_collection.get(), krylov_bases.get(), + hessenberg_iter.get(), iter, final_iter_nums.get(), stop_status.get()); + gko::kernels::hip::gmres::step_1( + hip, d_x->get_size()[0], d_givens_sin.get(), d_givens_cos.get(), + d_residual_norm.get(), d_residual_norm_collection.get(), + d_krylov_bases.get(), d_hessenberg_iter.get(), iter, + d_final_iter_nums.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_givens_sin, givens_sin, 1e-14); + GKO_ASSERT_MTX_NEAR(d_givens_cos, givens_cos, 1e-14); + GKO_ASSERT_MTX_NEAR(d_residual_norm, residual_norm, 1e-14); + GKO_ASSERT_MTX_NEAR(d_residual_norm_collection, residual_norm_collection, + 1e-14); + GKO_ASSERT_MTX_NEAR(d_hessenberg_iter, hessenberg_iter, 1e-14); + GKO_ASSERT_MTX_NEAR(d_krylov_bases, krylov_bases, 1e-14); + GKO_ASSERT_ARRAY_EQ(*d_final_iter_nums, *final_iter_nums); +} + + +TEST_F(Gmres, HipGmresStep2IsEquivalentToRef) +{ + 
initialize_data(); + + gko::kernels::reference::gmres::step_2(ref, residual_norm_collection.get(), + krylov_bases.get(), hessenberg.get(), + y.get(), before_preconditioner.get(), + final_iter_nums.get()); + gko::kernels::hip::gmres::step_2(hip, d_residual_norm_collection.get(), + d_krylov_bases.get(), d_hessenberg.get(), + d_y.get(), d_before_preconditioner.get(), + d_final_iter_nums.get()); + + GKO_ASSERT_MTX_NEAR(d_y, y, 1e-14); + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); +} + + +} // namespace diff --git a/hip/test/solver/ir_kernels.cpp b/hip/test/solver/ir_kernels.cpp new file mode 100644 index 00000000000..0e5791cd7cf --- /dev/null +++ b/hip/test/solver/ir_kernels.cpp @@ -0,0 +1,259 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include +#include +#include + + +#include "core/solver/ir_kernels.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class Ir : public ::testing::Test { +protected: + using Mtx = gko::matrix::Dense<>; + Ir() : rand_engine(30) {} + + void SetUp() + { + ref = gko::ReferenceExecutor::create(); + hip = gko::HipExecutor::create(0, ref); + } + + void TearDown() + { + if (hip != nullptr) { + ASSERT_NO_THROW(hip->synchronize()); + } + } + + std::unique_ptr gen_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + } + + std::shared_ptr ref; + std::shared_ptr hip; + + std::ranlux48 rand_engine; +}; + + +TEST_F(Ir, InitializeIsEquivalentToRef) +{ + auto stop_status = gko::Array(ref, 43); + for (size_t i = 0; i < stop_status.get_num_elems(); ++i) { + stop_status.get_data()[i].reset(); + } + auto d_stop_status = gko::Array(hip, stop_status); + + gko::kernels::reference::ir::initialize(ref, &stop_status); + gko::kernels::hip::ir::initialize(hip, &d_stop_status); + + auto tmp = gko::Array(ref, d_stop_status); + for (int i = 0; i < stop_status.get_num_elems(); ++i) { + ASSERT_EQ(stop_status.get_const_data()[i], tmp.get_const_data()[i]); + } +} + + +TEST_F(Ir, 
ApplyIsEquivalentToRef) +{ + auto mtx = gen_mtx(50, 50); + auto x = gen_mtx(50, 3); + auto b = gen_mtx(50, 3); + auto d_mtx = clone(hip, mtx); + auto d_x = clone(hip, x); + auto d_b = clone(hip, b); + // Forget about accuracy - Richardson is not going to converge for a random + // matrix, just check that a couple of iterations gives the same result on + // both executors + auto ir_factory = + gko::solver::Ir<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(2u).on(ref)) + .on(ref); + auto d_ir_factory = + gko::solver::Ir<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(2u).on(hip)) + .on(hip); + auto solver = ir_factory->generate(std::move(mtx)); + auto d_solver = d_ir_factory->generate(std::move(d_mtx)); + + solver->apply(lend(b), lend(x)); + d_solver->apply(lend(d_b), lend(d_x)); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); +} + + +TEST_F(Ir, ApplyWithIterativeInnerSolverIsEquivalentToRef) +{ + auto mtx = gen_mtx(50, 50); + auto x = gen_mtx(50, 3); + auto b = gen_mtx(50, 3); + auto d_mtx = clone(hip, mtx); + auto d_x = clone(hip, x); + auto d_b = clone(hip, b); + + auto ir_factory = + gko::solver::Ir<>::build() + .with_solver( + gko::solver::Gmres<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on( + ref)) + .on(ref)) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(2u).on(ref)) + .on(ref); + auto d_ir_factory = + gko::solver::Ir<>::build() + .with_solver( + gko::solver::Gmres<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on( + hip)) + .on(hip)) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(2u).on(hip)) + .on(hip); + auto solver = ir_factory->generate(std::move(mtx)); + auto d_solver = d_ir_factory->generate(std::move(d_mtx)); + + solver->apply(lend(b), lend(x)); + d_solver->apply(lend(d_b), lend(d_x)); + + // Note: 1e-12 instead of 1e-14, as the difference in the inner gmres + // iteration gets amplified by the 
difference in IR. + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Ir, RichardsonApplyIsEquivalentToRef) +{ + auto mtx = gen_mtx(50, 50); + auto x = gen_mtx(50, 3); + auto b = gen_mtx(50, 3); + auto d_mtx = clone(hip, mtx); + auto d_x = clone(hip, x); + auto d_b = clone(hip, b); + // Forget about accuracy - Richardson is not going to converge for a random + // matrix, just check that a couple of iterations gives the same result on + // both executors + auto ir_factory = + gko::solver::Ir<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(2u).on(ref)) + .with_relaxation_factor(0.9) + .on(ref); + auto d_ir_factory = + gko::solver::Ir<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(2u).on(hip)) + .with_relaxation_factor(0.9) + .on(hip); + auto solver = ir_factory->generate(std::move(mtx)); + auto d_solver = d_ir_factory->generate(std::move(d_mtx)); + + solver->apply(lend(b), lend(x)); + d_solver->apply(lend(d_b), lend(d_x)); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); +} + + +TEST_F(Ir, RichardsonApplyWithIterativeInnerSolverIsEquivalentToRef) +{ + auto mtx = gen_mtx(50, 50); + auto x = gen_mtx(50, 3); + auto b = gen_mtx(50, 3); + auto d_mtx = clone(hip, mtx); + auto d_x = clone(hip, x); + auto d_b = clone(hip, b); + auto ir_factory = + gko::solver::Ir<>::build() + .with_solver( + gko::solver::Gmres<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on( + ref)) + .on(ref)) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(2u).on(ref)) + .with_relaxation_factor(0.9) + .on(ref); + auto d_ir_factory = + gko::solver::Ir<>::build() + .with_solver( + gko::solver::Gmres<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on( + hip)) + .on(hip)) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(2u).on(hip)) + .with_relaxation_factor(0.9) + .on(hip); + auto solver = ir_factory->generate(std::move(mtx)); + auto d_solver = 
d_ir_factory->generate(std::move(d_mtx)); + + solver->apply(lend(b), lend(x)); + d_solver->apply(lend(d_b), lend(d_x)); + + // Note: 1e-12 instead of 1e-14, as the difference in the inner gmres + // iteration gets amplified by the difference in IR. + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +} // namespace diff --git a/hip/test/solver/lower_trs_kernels.cpp b/hip/test/solver/lower_trs_kernels.cpp new file mode 100644 index 00000000000..b497b525020 --- /dev/null +++ b/hip/test/solver/lower_trs_kernels.cpp @@ -0,0 +1,167 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include +#include + + +#include + + +#include +#include +#include +#include + + +#include "core/solver/lower_trs_kernels.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class LowerTrs : public ::testing::Test { +protected: + using CsrMtx = gko::matrix::Csr; + using Mtx = gko::matrix::Dense<>; + + LowerTrs() : rand_engine(30) {} + + void SetUp() + { + ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); + ref = gko::ReferenceExecutor::create(); + hip = gko::HipExecutor::create(0, ref); + } + + void TearDown() + { + if (hip != nullptr) { + ASSERT_NO_THROW(hip->synchronize()); + } + } + + std::unique_ptr gen_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + } + + std::unique_ptr gen_l_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_lower_triangular_matrix( + num_rows, num_cols, false, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + } + + void initialize_data(int m, int n) + { + mtx = gen_l_mtx(m, m); + b = gen_mtx(m, n); + x = gen_mtx(m, n); + csr_mtx = CsrMtx::create(ref); + mtx->convert_to(csr_mtx.get()); + d_csr_mtx = CsrMtx::create(hip); + d_x = Mtx::create(hip); + d_x->copy_from(x.get()); + 
d_csr_mtx->copy_from(csr_mtx.get()); + b2 = Mtx::create(ref); + d_b2 = Mtx::create(hip); + d_b2->copy_from(b.get()); + b2->copy_from(b.get()); + } + + std::shared_ptr b; + std::shared_ptr b2; + std::shared_ptr x; + std::shared_ptr mtx; + std::shared_ptr csr_mtx; + std::shared_ptr d_b; + std::shared_ptr d_b2; + std::shared_ptr d_x; + std::shared_ptr d_csr_mtx; + std::shared_ptr ref; + std::shared_ptr hip; + std::ranlux48 rand_engine; +}; + + +TEST_F(LowerTrs, HipLowerTrsFlagCheckIsCorrect) +{ + bool trans_flag = false; + bool expected_flag = true; + gko::kernels::hip::lower_trs::should_perform_transpose(hip, trans_flag); + + ASSERT_EQ(expected_flag, trans_flag); +} + + +TEST_F(LowerTrs, HipSingleRhsApplyIsEquivalentToRef) +{ + initialize_data(50, 1); + auto lower_trs_factory = gko::solver::LowerTrs<>::build().on(ref); + auto d_lower_trs_factory = gko::solver::LowerTrs<>::build().on(hip); + auto solver = lower_trs_factory->generate(csr_mtx); + auto d_solver = d_lower_trs_factory->generate(d_csr_mtx); + + solver->apply(b2.get(), x.get()); + d_solver->apply(d_b2.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); +} + + +TEST_F(LowerTrs, HipMultipleRhsApplyIsEquivalentToRef) +{ + initialize_data(50, 3); + auto lower_trs_factory = + gko::solver::LowerTrs<>::build().with_num_rhs(3u).on(ref); + auto d_lower_trs_factory = + gko::solver::LowerTrs<>::build().with_num_rhs(3u).on(hip); + auto solver = lower_trs_factory->generate(csr_mtx); + auto d_solver = d_lower_trs_factory->generate(d_csr_mtx); + + solver->apply(b2.get(), x.get()); + d_solver->apply(d_b2.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); +} + + +} // namespace diff --git a/hip/test/solver/upper_trs_kernels.cpp b/hip/test/solver/upper_trs_kernels.cpp new file mode 100644 index 00000000000..ba55bc6325c --- /dev/null +++ b/hip/test/solver/upper_trs_kernels.cpp @@ -0,0 +1,167 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All 
rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#include + + +#include + + +#include +#include +#include +#include + + +#include "core/solver/upper_trs_kernels.hpp" +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class UpperTrs : public ::testing::Test { +protected: + using CsrMtx = gko::matrix::Csr; + using Mtx = gko::matrix::Dense<>; + + UpperTrs() : rand_engine(30) {} + + void SetUp() + { + ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); + ref = gko::ReferenceExecutor::create(); + hip = gko::HipExecutor::create(0, ref); + } + + void TearDown() + { + if (hip != nullptr) { + ASSERT_NO_THROW(hip->synchronize()); + } + } + + std::unique_ptr gen_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + } + + std::unique_ptr gen_u_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_upper_triangular_matrix( + num_rows, num_cols, false, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + } + + void initialize_data(int m, int n) + { + mtx = gen_u_mtx(m, m); + b = gen_mtx(m, n); + x = gen_mtx(m, n); + csr_mtx = CsrMtx::create(ref); + mtx->convert_to(csr_mtx.get()); + d_csr_mtx = CsrMtx::create(hip); + d_x = Mtx::create(hip); + d_x->copy_from(x.get()); + d_csr_mtx->copy_from(csr_mtx.get()); + b2 = Mtx::create(ref); + d_b2 = Mtx::create(hip); + d_b2->copy_from(b.get()); + b2->copy_from(b.get()); + } + + std::shared_ptr b; + std::shared_ptr b2; + std::shared_ptr x; + std::shared_ptr mtx; + std::shared_ptr csr_mtx; + std::shared_ptr d_b; + std::shared_ptr d_b2; + std::shared_ptr d_x; + std::shared_ptr d_csr_mtx; + std::shared_ptr ref; + std::shared_ptr hip; + std::ranlux48 rand_engine; +}; + + +TEST_F(UpperTrs, HipUpperTrsFlagCheckIsCorrect) +{ + bool trans_flag = false; + bool 
expected_flag = true; + gko::kernels::hip::upper_trs::should_perform_transpose(hip, trans_flag); + + ASSERT_EQ(expected_flag, trans_flag); +} + + +TEST_F(UpperTrs, HipSingleRhsApplyIsEquivalentToRef) +{ + initialize_data(50, 1); + auto upper_trs_factory = gko::solver::UpperTrs<>::build().on(ref); + auto d_upper_trs_factory = gko::solver::UpperTrs<>::build().on(hip); + auto solver = upper_trs_factory->generate(csr_mtx); + auto d_solver = d_upper_trs_factory->generate(d_csr_mtx); + + solver->apply(b2.get(), x.get()); + d_solver->apply(d_b2.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); +} + + +TEST_F(UpperTrs, HipMultipleRhsApplyIsEquivalentToRef) +{ + initialize_data(50, 3); + auto upper_trs_factory = + gko::solver::UpperTrs<>::build().with_num_rhs(3u).on(ref); + auto d_upper_trs_factory = + gko::solver::UpperTrs<>::build().with_num_rhs(3u).on(hip); + auto solver = upper_trs_factory->generate(csr_mtx); + auto d_solver = d_upper_trs_factory->generate(d_csr_mtx); + + solver->apply(b2.get(), x.get()); + d_solver->apply(d_b2.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); +} + + +} // namespace diff --git a/hip/test/stop/CMakeLists.txt b/hip/test/stop/CMakeLists.txt new file mode 100644 index 00000000000..844f7037768 --- /dev/null +++ b/hip/test/stop/CMakeLists.txt @@ -0,0 +1,2 @@ +ginkgo_create_hip_test(criterion_kernels) +ginkgo_create_hip_test_special_linkage(residual_norm_kernels) diff --git a/hip/test/stop/criterion_kernels.hip.cpp b/hip/test/stop/criterion_kernels.hip.cpp new file mode 100644 index 00000000000..92935ea4867 --- /dev/null +++ b/hip/test/stop/criterion_kernels.hip.cpp @@ -0,0 +1,111 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include +#include + + +#include "hip/test/utils.hip.hpp" + + +namespace { + + +constexpr gko::size_type test_iterations = 10; + + +class Criterion : public ::testing::Test { +protected: + Criterion() + { + ref_ = gko::ReferenceExecutor::create(); + hip_ = gko::HipExecutor::create(0, ref_); + // Actually use an iteration stopping criterion because Criterion is an + // abstract class + factory_ = gko::stop::Iteration::build() + .with_max_iters(test_iterations) + .on(hip_); + } + + std::unique_ptr factory_; + std::shared_ptr ref_; + std::shared_ptr hip_; +}; + + +TEST_F(Criterion, SetsOneStopStatus) +{ + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + auto criterion = factory_->generate(nullptr, nullptr, nullptr); + gko::Array stop_status(ref_, 1); + stop_status.get_data()[0].reset(); + + stop_status.set_executor(hip_); + criterion->update() + .num_iterations(test_iterations) + .check(RelativeStoppingId, true, &stop_status, &one_changed); + stop_status.set_executor(ref_); + + ASSERT_EQ(stop_status.get_data()[0].has_stopped(), true); +} + + +TEST_F(Criterion, SetsMultipleStopStatuses) +{ + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + auto criterion = factory_->generate(nullptr, nullptr, nullptr); + gko::Array stop_status(ref_, 3); + stop_status.get_data()[0].reset(); + stop_status.get_data()[1].reset(); + stop_status.get_data()[2].reset(); + + stop_status.set_executor(hip_); + criterion->update() + .num_iterations(test_iterations) + .check(RelativeStoppingId, true, &stop_status, &one_changed); + stop_status.set_executor(ref_); + + ASSERT_EQ(stop_status.get_data()[0].has_stopped(), true); + ASSERT_EQ(stop_status.get_data()[1].has_stopped(), true); + ASSERT_EQ(stop_status.get_data()[2].has_stopped(), true); +} + + +} // namespace diff --git a/hip/test/stop/residual_norm_kernels.cpp b/hip/test/stop/residual_norm_kernels.cpp new file 
mode 100644 index 00000000000..42c505da601 --- /dev/null +++ b/hip/test/stop/residual_norm_kernels.cpp @@ -0,0 +1,369 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include "hip/test/utils.hip.hpp" + + +namespace { + + +constexpr double tol = 1.0e-14; + + +class ResidualNormReduction : public ::testing::Test { +protected: + using Mtx = gko::matrix::Dense<>; + + ResidualNormReduction() + { + ref_ = gko::ReferenceExecutor::create(); + hip_ = gko::HipExecutor::create(0, ref_); + factory_ = gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(tol) + .on(hip_); + } + + std::unique_ptr::Factory> factory_; + std::shared_ptr hip_; + std::shared_ptr ref_; +}; + + +TEST_F(ResidualNormReduction, WaitsTillResidualGoal) +{ + auto res = gko::initialize({100.0}, ref_); + auto d_res = Mtx::create(hip_); + d_res->copy_from(res.get()); + std::shared_ptr rhs = gko::initialize({10.0}, ref_); + std::shared_ptr d_rhs = Mtx::create(hip_); + d_rhs->copy_from(rhs.get()); + auto criterion = factory_->generate(nullptr, d_rhs, nullptr, d_res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(ref_, 1); + stop_status.get_data()[0].reset(); + stop_status.set_executor(hip_); + + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res->at(0) = tol * 1.1e+2; + d_res->copy_from(res.get()); + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_FALSE(stop_status.get_data()[0].has_converged()); + stop_status.set_executor(hip_); + ASSERT_FALSE(one_changed); + + res->at(0) = tol * 0.9e+2; + d_res->copy_from(res.get()); + ASSERT_TRUE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_TRUE(stop_status.get_data()[0].has_converged()); + ASSERT_TRUE(one_changed); +} + + +TEST_F(ResidualNormReduction, 
WaitsTillResidualGoalMultipleRHS) +{ + auto res = gko::initialize({{100.0, 100.0}}, ref_); + auto d_res = Mtx::create(hip_); + d_res->copy_from(res.get()); + std::shared_ptr rhs = + gko::initialize({{10.0, 10.0}}, ref_); + std::shared_ptr d_rhs = Mtx::create(hip_); + d_rhs->copy_from(rhs.get()); + auto criterion = factory_->generate(nullptr, d_rhs, nullptr, d_res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(ref_, 2); + stop_status.get_data()[0].reset(); + stop_status.get_data()[1].reset(); + stop_status.set_executor(hip_); + + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res->at(0, 0) = tol * 0.9e+2; + d_res->copy_from(res.get()); + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_TRUE(stop_status.get_data()[0].has_converged()); + stop_status.set_executor(hip_); + ASSERT_TRUE(one_changed); + + res->at(0, 1) = tol * 0.9e+2; + d_res->copy_from(res.get()); + ASSERT_TRUE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_TRUE(stop_status.get_data()[1].has_converged()); + ASSERT_TRUE(one_changed); +} + + +class RelativeResidualNorm : public ::testing::Test { +protected: + using Mtx = gko::matrix::Dense<>; + + RelativeResidualNorm() + { + ref_ = gko::ReferenceExecutor::create(); + hip_ = gko::HipExecutor::create(0, ref_); + factory_ = + gko::stop::RelativeResidualNorm<>::build().with_tolerance(tol).on( + hip_); + } + + std::unique_ptr::Factory> factory_; + std::shared_ptr hip_; + std::shared_ptr ref_; +}; + + +TEST_F(RelativeResidualNorm, WaitsTillResidualGoal) +{ + auto res = gko::initialize({100.0}, ref_); + auto d_res = Mtx::create(hip_); + d_res->copy_from(res.get()); + std::shared_ptr rhs = 
gko::initialize({10.0}, ref_); + std::shared_ptr d_rhs = Mtx::create(hip_); + d_rhs->copy_from(rhs.get()); + auto criterion = factory_->generate(nullptr, d_rhs, nullptr, d_res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(ref_, 1); + stop_status.get_data()[0].reset(); + stop_status.set_executor(hip_); + + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res->at(0) = tol * 1.1e+1; + d_res->copy_from(res.get()); + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_FALSE(stop_status.get_data()[0].has_converged()); + stop_status.set_executor(hip_); + ASSERT_FALSE(one_changed); + + res->at(0) = tol * 0.9e+1; + d_res->copy_from(res.get()); + ASSERT_TRUE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_TRUE(stop_status.get_data()[0].has_converged()); + ASSERT_TRUE(one_changed); +} + + +TEST_F(RelativeResidualNorm, WaitsTillResidualGoalMultipleRHS) +{ + auto res = gko::initialize({{100.0, 100.0}}, ref_); + auto d_res = Mtx::create(hip_); + d_res->copy_from(res.get()); + std::shared_ptr rhs = + gko::initialize({{10.0, 10.0}}, ref_); + std::shared_ptr d_rhs = Mtx::create(hip_); + d_rhs->copy_from(rhs.get()); + auto criterion = factory_->generate(nullptr, d_rhs, nullptr, d_res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(ref_, 2); + stop_status.get_data()[0].reset(); + stop_status.get_data()[1].reset(); + stop_status.set_executor(hip_); + + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res->at(0, 0) = tol * 0.9e+1; + d_res->copy_from(res.get()); + ASSERT_FALSE( + 
criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_TRUE(stop_status.get_data()[0].has_converged()); + stop_status.set_executor(hip_); + ASSERT_TRUE(one_changed); + + res->at(0, 1) = tol * 0.9e+1; + d_res->copy_from(res.get()); + ASSERT_TRUE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_TRUE(stop_status.get_data()[1].has_converged()); + ASSERT_TRUE(one_changed); +} + + +class AbsoluteResidualNorm : public ::testing::Test { +protected: + using Mtx = gko::matrix::Dense<>; + + AbsoluteResidualNorm() + { + ref_ = gko::ReferenceExecutor::create(); + hip_ = gko::HipExecutor::create(0, ref_); + factory_ = + gko::stop::AbsoluteResidualNorm<>::build().with_tolerance(tol).on( + hip_); + } + + std::unique_ptr::Factory> factory_; + std::shared_ptr hip_; + std::shared_ptr ref_; +}; + + +TEST_F(AbsoluteResidualNorm, WaitsTillResidualGoal) +{ + auto res = gko::initialize({100.0}, ref_); + auto d_res = Mtx::create(hip_); + d_res->copy_from(res.get()); + std::shared_ptr rhs = gko::initialize({10.0}, ref_); + std::shared_ptr d_rhs = Mtx::create(hip_); + d_rhs->copy_from(rhs.get()); + auto criterion = factory_->generate(nullptr, d_rhs, nullptr, d_res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(ref_, 1); + stop_status.get_data()[0].reset(); + stop_status.set_executor(hip_); + + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res->at(0) = tol * 1.1; + d_res->copy_from(res.get()); + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_FALSE(stop_status.get_data()[0].has_converged()); + 
stop_status.set_executor(hip_); + ASSERT_FALSE(one_changed); + + res->at(0) = tol * 0.9; + d_res->copy_from(res.get()); + ASSERT_TRUE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_TRUE(stop_status.get_data()[0].has_converged()); + ASSERT_TRUE(one_changed); +} + + +TEST_F(AbsoluteResidualNorm, WaitsTillResidualGoalMultipleRHS) +{ + auto res = gko::initialize({{100.0, 100.0}}, ref_); + auto d_res = Mtx::create(hip_); + d_res->copy_from(res.get()); + std::shared_ptr rhs = + gko::initialize({{10.0, 10.0}}, ref_); + std::shared_ptr d_rhs = Mtx::create(hip_); + d_rhs->copy_from(rhs.get()); + auto criterion = factory_->generate(nullptr, d_rhs, nullptr, d_res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(ref_, 2); + stop_status.get_data()[0].reset(); + stop_status.get_data()[1].reset(); + stop_status.set_executor(hip_); + + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res->at(0, 0) = tol * 0.9; + d_res->copy_from(res.get()); + ASSERT_FALSE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_TRUE(stop_status.get_data()[0].has_converged()); + stop_status.set_executor(hip_); + ASSERT_TRUE(one_changed); + + res->at(0, 1) = tol * 0.9; + d_res->copy_from(res.get()); + ASSERT_TRUE( + criterion->update() + .residual_norm(d_res.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + stop_status.set_executor(ref_); + ASSERT_TRUE(stop_status.get_data()[1].has_converged()); + ASSERT_TRUE(one_changed); +} + + +} // namespace diff --git a/hip/test/utils.hip.hpp b/hip/test/utils.hip.hpp new file mode 100644 index 00000000000..8d524b0b615 --- /dev/null +++ b/hip/test/utils.hip.hpp @@ -0,0 +1,54 @@ 
+/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_HIP_TEST_UTILS_HIP_HPP_ +#define GKO_HIP_TEST_UTILS_HIP_HPP_ + + +#include "core/test/utils.hpp" + + +#include + + +namespace { + + +// prevent device reset after each test +auto no_reset_exec = + gko::HipExecutor::create(0, gko::ReferenceExecutor::create(), true); + + +} // namespace + + +#endif // GKO_HIP_TEST_UTILS_HIP_HPP_ diff --git a/hip/test/utils/CMakeLists.txt b/hip/test/utils/CMakeLists.txt new file mode 100644 index 00000000000..a6c52f65d9c --- /dev/null +++ b/hip/test/utils/CMakeLists.txt @@ -0,0 +1 @@ +ginkgo_create_hip_test(assertions_test) diff --git a/hip/test/utils/assertions_test.hip.cpp b/hip/test/utils/assertions_test.hip.cpp new file mode 100644 index 00000000000..2d5c67addc1 --- /dev/null +++ b/hip/test/utils/assertions_test.hip.cpp @@ -0,0 +1,84 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/test/utils/assertions.hpp" + + +#include + + +#include +#include + + +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class MatricesNear : public ::testing::Test { +protected: + void SetUp() + { + ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); + ref = gko::ReferenceExecutor::create(); + hip = gko::HipExecutor::create(0, ref); + } + + void TearDown() + { + if (hip != nullptr) { + ASSERT_NO_THROW(hip->synchronize()); + } + } + + std::shared_ptr ref; + std::shared_ptr hip; +}; + + +TEST_F(MatricesNear, CanPassHipMatrix) +{ + auto mtx = gko::initialize>( + {{1.0, 2.0, 3.0}, {0.0, 4.0, 0.0}}, ref); + auto csr_ref = gko::matrix::Csr<>::create(ref); + csr_ref->copy_from(mtx.get()); + auto csr_mtx = gko::matrix::Csr<>::create(hip); + csr_mtx->copy_from(std::move(csr_ref)); + + GKO_EXPECT_MTX_NEAR(csr_mtx, mtx, 0.0); + GKO_ASSERT_MTX_NEAR(csr_mtx, mtx, 0.0); +} + + +} // namespace diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt new file mode 100644 index 00000000000..004b7b359ad --- /dev/null +++ b/include/CMakeLists.txt @@ -0,0 +1,6 @@ +if (GINKGO_CHECK_CIRCULAR_DEPS) + add_library(ginkgo_public_api INTERFACE) # dummy target + set_property(TARGET ginkgo_public_api APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR}") + set_property(TARGET ginkgo_public_api APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_CURRENT_BINARY_DIR}") + 
ginkgo_check_headers(ginkgo_public_api) +endif() diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in index aeb8cb30712..fc5bae0b225 100644 --- a/include/ginkgo/config.hpp.in +++ b/include/ginkgo/config.hpp.in @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -59,6 +59,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #cmakedefine GINKGO_JACOBI_FULL_OPTIMIZATIONS +/* What is HIP compiled for, hcc or nvcc? */ +// clang-format off +#define GINKGO_HIP_PLATFORM_HCC @GINKGO_HIP_PLATFORM_HCC@ + + +#define GINKGO_HIP_PLATFORM_NVCC @GINKGO_HIP_PLATFORM_NVCC@ +// clang-format on + + /* Is PAPI SDE available for Logging? */ // clang-format off #define GKO_HAVE_PAPI_SDE @GINKGO_HAVE_PAPI_SDE@ diff --git a/include/ginkgo/core/base/abstract_factory.hpp b/include/ginkgo/core/base/abstract_factory.hpp index 58a3407d3fc..2db193027a1 100644 --- a/include/ginkgo/core/base/abstract_factory.hpp +++ b/include/ginkgo/core/base/abstract_factory.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/include/ginkgo/core/base/array.hpp b/include/ginkgo/core/base/array.hpp index 1084ad324bb..5d53eeb58bb 100644 --- a/include/ginkgo/core/base/array.hpp +++ b/include/ginkgo/core/base/array.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -30,15 +30,18 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_CORE_BASE_ARRAY_H_ -#define GKO_CORE_BASE_ARRAY_H_ +#ifndef GKO_CORE_BASE_ARRAY_HPP_ +#define GKO_CORE_BASE_ARRAY_HPP_ +#include +#include #include #include #include +#include #include #include #include @@ -47,6 +50,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { +namespace detail { + + +/** + * @internal + * + * Converts `size` elements of type `SourceType` stored at `src` on `exec` + * to `TargetType` stored at `dst`. + */ +template +void convert_data(std::shared_ptr exec, size_type size, + const SourceType *src, TargetType *dst); + + +} // namespace detail + + /** * An Array is a container which encapsulates fixed-sized arrays, stored on the * Executor tied to the Array. @@ -178,11 +198,8 @@ class Array { RandomAccessIterator end) : Array(exec) { - Array tmp(exec->get_master(), end - begin); - int i = 0; - for (auto it = begin; it != end; ++it, ++i) { - tmp.data_[i] = *it; - } + Array tmp(exec->get_master(), std::distance(begin, end)); + std::copy(begin, end, tmp.data_.get()); *this = std::move(tmp); } @@ -255,7 +272,8 @@ class Array { * Creates an Array from existing memory. * * The Array does not take ownership of the memory, and will not deallocate - * it once it goes out of scope. + * it once it goes out of scope. This array type cannot use the function + * `resize_and_reset` since it does not own the data it should resize. * * @param exec executor where `data` is located * @param num_elems number of elements in `data` @@ -270,7 +288,10 @@ class Array { } /** - * Copies data from another array. + * Copies data from another array or view. In the case of an array target, + * the array is resized to match the source's size. 
In the case of a view + * target, if the dimensions are not compatible a gko::OutOfBoundsError is + * thrown. * * This does not invoke the constructors of the elements, instead they are * copied as POD types. @@ -292,17 +313,39 @@ class Array { data_ = data_manager{nullptr, other.data_.get_deleter()}; } if (other.get_executor() == nullptr) { - this->resize_and_reset(0); + this->clear(); return *this; } - this->resize_and_reset(other.get_num_elems()); - exec_->copy_from(other.get_executor().get(), num_elems_, + + if (this->is_owning()) { + this->resize_and_reset(other.get_num_elems()); + } else { + GKO_ENSURE_COMPATIBLE_BOUNDS(other.get_num_elems(), + this->num_elems_); + } + exec_->copy_from(other.get_executor().get(), other.get_num_elems(), other.get_const_data(), this->get_data()); return *this; } /** - * Moves data from another array. + * Moves data from another array or view. Only the pointer and deleter type + * change, a copy only happens when targeting another executor's data. This + * means that in the following situation: + * ```cpp + * gko::Array a; // an existing array or view + * gko::Array b; // an existing array or view + * b = std::move(a); + * ``` + * Depending on whether `a` and `b` are array or view, this happens: + * + `a` and `b` are views, `b` becomes the only valid view of `a`; + * + `a` and `b` are arrays, `b` becomes the only valid array of `a`; + * + `a` is a view and `b` is an array, `b` frees its data and becomes the + * only valid view of `a`; + * + `a` is an array and `b` is a view, `b` becomes the only valid array + * of `a`. + * + * In all the previous cases, `a` becomes invalid (e.g., a `nullptr`). * * This does not invoke the constructors of the elements, instead they are * copied as POD types. 
@@ -324,22 +367,70 @@ class Array { data_ = data_manager{nullptr, other.data_.get_deleter()}; } if (other.get_executor() == nullptr) { - this->resize_and_reset(0); + this->clear(); return *this; } - if (exec_ == other.get_executor() && - data_.get_deleter().target_type() != typeid(view_deleter)) { - // same device and not a view, only move the pointer + if (exec_ == other.get_executor()) { + // same device, only move the pointer using std::swap; swap(data_, other.data_); swap(num_elems_, other.num_elems_); + other.clear(); } else { - // different device or a view, copy the data + // different device, copy the data *this = other; } return *this; } + /** + * Copies and converts data from another array with another data type. + * In the case of an array target, the array is resized to match the + * source's size. In the case of a view target, if the dimensions are not + * compatible a gko::OutOfBoundsError is thrown. + * + * This does not invoke the constructors of the elements, instead they are + * copied as POD types. + * + * The executor of this is preserved. In case this does not have an assigned + * executor, it will inherit the executor of other. 
+ * + * @param other the Array to copy from + * @tparam OtherValueType the value type of `other` + * + * @return this + */ + template + xstd::enable_if_t::value, Array> + &operator=(const Array &other) + { + if (this->exec_ == nullptr) { + this->exec_ = other.get_executor(); + this->data_ = data_manager{nullptr, default_deleter{this->exec_}}; + } + if (other.get_executor() == nullptr) { + this->clear(); + return *this; + } + + if (this->is_owning()) { + this->resize_and_reset(other.get_num_elems()); + } else { + GKO_ENSURE_COMPATIBLE_BOUNDS(other.get_num_elems(), + this->num_elems_); + } + Array tmp{this->exec_}; + const OtherValueType *source = other.get_const_data(); + // if we are on different executors: copy, then convert + if (this->exec_ != other.get_executor()) { + tmp = other; + source = tmp.get_const_data(); + } + detail::convert_data(this->exec_, other.get_num_elems(), source, + this->get_data()); + return *this; + } + /** * Deallocates all data used by the Array. * @@ -355,6 +446,8 @@ class Array { /** * Resizes the array so it is able to hold the specified number of elements. + * For a view and other non-owning Array types, this throws an exception + * since these types cannot be resized. * * All data stored in the array will be lost. * @@ -372,11 +465,16 @@ class Array { throw gko::NotSupported(__FILE__, __LINE__, __func__, "gko::Executor (nullptr)"); } - num_elems_ = num_elems; - if (num_elems > 0) { + if (!this->is_owning()) { + throw gko::NotSupported(__FILE__, __LINE__, __func__, + "Non owning gko::Array cannot be resized."); + } + + if (num_elems > 0 && this->is_owning()) { + num_elems_ = num_elems; data_.reset(exec_->alloc(num_elems)); } else { - data_.reset(nullptr); + this->clear(); } } @@ -433,7 +531,28 @@ class Array { data_ = std::move(tmp.data_); } + /** + * Tells whether this Array owns its data or not. + * + * Views do not own their data and this has multiple implications. 
They + * cannot be resized since the data is not owned by the Array which stores a + * view. It is also unclear whether custom deleter types are owning types as + * they could be a user-created view-type, therefore only proper Array which + * use the `default_deleter` are considered owning types. + * + * @return whether this Array can be resized or not. + */ + bool is_owning() + { + return data_.get_deleter().target_type() == typeid(default_deleter); + } + + private: + // Allow other Array types to access private members + template + friend class Array; + using data_manager = std::unique_ptr>; @@ -446,4 +565,4 @@ class Array { } // namespace gko -#endif // GKO_CORE_BASE_ARRAY_H_ +#endif // GKO_CORE_BASE_ARRAY_HPP_ diff --git a/include/ginkgo/core/base/combination.hpp b/include/ginkgo/core/base/combination.hpp index 330a11f8cd3..908013a3e57 100644 --- a/include/ginkgo/core/base/combination.hpp +++ b/include/ginkgo/core/base/combination.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -53,12 +53,14 @@ namespace gko { */ template class Combination : public EnableLinOp>, - public EnableCreateMethod> { + public EnableCreateMethod>, + public Transposable { friend class EnablePolymorphicObject; friend class EnableCreateMethod; public: using value_type = ValueType; + using transposed_type = Combination; /** * Returns a list of coefficients of the combination. @@ -82,6 +84,10 @@ class Combination : public EnableLinOp>, return operators_; } + std::unique_ptr transpose() const override; + + std::unique_ptr conj_transpose() const override; + protected: /** * Creates an empty linear combination (0x0 operator). 
diff --git a/include/ginkgo/core/base/composition.hpp b/include/ginkgo/core/base/composition.hpp index 740c3e235f9..4a7ecc7874f 100644 --- a/include/ginkgo/core/base/composition.hpp +++ b/include/ginkgo/core/base/composition.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -48,18 +48,27 @@ namespace gko { * The Composition class can be used to compose linear operators `op1, op2, ..., * opn` and obtain the operator `op1 * op2 * ... * opn`. * + * All LinOps of the Composition must operate on Dense inputs. + * For an operator `op_k` that requires an initial guess for its `apply`, + * Composition provides either + * * the output of the previous `op_{k+1}->apply` if `op_k` has square dimension + * * zero if `op_k` is rectangular + * as an initial guess. + * * @tparam ValueType precision of input and result vectors * * @ingroup LinOp */ template class Composition : public EnableLinOp>, - public EnableCreateMethod> { + public EnableCreateMethod>, + public Transposable { friend class EnablePolymorphicObject; friend class EnableCreateMethod; public: using value_type = ValueType; + using transposed_type = Composition; /** * Returns a list of operators of the composition. @@ -72,6 +81,10 @@ class Composition : public EnableLinOp>, return operators_; } + std::unique_ptr transpose() const override; + + std::unique_ptr conj_transpose() const override; + protected: /** * Creates an empty operator composition (0x0 operator).
@@ -79,7 +92,7 @@ class Composition : public EnableLinOp>, * @param exec Executor associated to the composition */ explicit Composition(std::shared_ptr exec) - : EnableLinOp(exec) + : EnableLinOp(exec), storage_{exec} {} /** @@ -101,6 +114,7 @@ class Composition : public EnableLinOp>, } return (*begin)->get_executor(); }()), + storage_{(*begin)->get_executor()}, operators_(begin, end) { this->set_size(gko::dim<2>{operators_.front()->get_size()[0], @@ -138,7 +152,8 @@ class Composition : public EnableLinOp>, */ explicit Composition(std::shared_ptr oper) : EnableLinOp(oper->get_executor(), oper->get_size()), - operators_{oper} + operators_{oper}, + storage_{oper->get_executor()} {} void apply_impl(const LinOp *b, LinOp *x) const override; @@ -148,18 +163,7 @@ class Composition : public EnableLinOp>, private: std::vector> operators_; - - // TODO: solve race conditions when multithreading - mutable struct cache_struct { - cache_struct() = default; - ~cache_struct() = default; - cache_struct(const cache_struct &other) {} - cache_struct &operator=(const cache_struct &other) { return *this; } - - // TODO: reduce the amount of intermediate vectors we need (careful -- - // not all of them are of the same size) - std::vector> intermediate; - } cache_; + mutable Array storage_; }; diff --git a/include/ginkgo/core/base/dim.hpp b/include/ginkgo/core/base/dim.hpp index 8c0155aee44..c0256df30dc 100644 --- a/include/ginkgo/core/base/dim.hpp +++ b/include/ginkgo/core/base/dim.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -97,7 +97,8 @@ struct dim { constexpr GKO_ATTRIBUTES const dimension_type &operator[]( const size_type &dimension) const noexcept { - return GKO_ASSERT(dimension < dimensionality), *(&first_ + dimension); + return GKO_ASSERT(dimension < dimensionality), + dimension == 0 ? first_ : rest_[dimension - 1]; } /** @@ -106,7 +107,8 @@ struct dim { GKO_ATTRIBUTES dimension_type &operator[]( const size_type &dimension) noexcept { - return GKO_ASSERT(dimension < dimensionality), *(&first_ + dimension); + return GKO_ASSERT(dimension < dimensionality), + dimension == 0 ? first_ : rest_[dimension - 1]; } /** @@ -173,12 +175,12 @@ struct dim<1u, DimensionType> { constexpr GKO_ATTRIBUTES const dimension_type &operator[]( const size_type &dimension) const noexcept { - return *(&first_ + dimension); + return GKO_ASSERT(dimension == 0), first_; } GKO_ATTRIBUTES dimension_type &operator[](const size_type &dimension) { - return *(&first_ + dimension); + return GKO_ASSERT(dimension == 0), first_; } constexpr GKO_ATTRIBUTES operator bool() const diff --git a/include/ginkgo/core/base/exception.hpp b/include/ginkgo/core/base/exception.hpp index 1855a9dc5c7..78fe81a617e 100644 --- a/include/ginkgo/core/base/exception.hpp +++ b/include/ginkgo/core/base/exception.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,17 +30,17 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ -#ifndef GKO_CORE_EXCEPTION_HPP_ -#define GKO_CORE_EXCEPTION_HPP_ - - -#include +#ifndef GKO_CORE_BASE_EXCEPTION_HPP_ +#define GKO_CORE_BASE_EXCEPTION_HPP_ #include #include +#include + + namespace gko { @@ -88,9 +88,9 @@ class Error : public std::exception { /** * Initializes an error. * - * @param file The name of the offending source file - * @param line The source code line number where the error occurred - * @param what The error message + * @param file The name of the offending source file + * @param line The source code line number where the error occurred + * @param what The error message */ Error(const std::string &file, int line, const std::string &what) : what_(file + ":" + std::to_string(line) + ": " + what) @@ -116,9 +116,9 @@ class NotImplemented : public Error { /** * Initializes a NotImplemented error. * - * @param file The name of the offending source file - * @param line The source code line number where the error occurred - * @param func The name of the not-yet implemented function + * @param file The name of the offending source file + * @param line The source code line number where the error occurred + * @param func The name of the not-yet implemented function */ NotImplemented(const std::string &file, int line, const std::string &func) : Error(file, line, func + " is not implemented") @@ -135,10 +135,10 @@ class NotCompiled : public Error { /** * Initializes a NotCompiled error. 
* - * @param file The name of the offending source file - * @param line The source code line number where the error occurred - * @param func The name of the function that has not been compiled - * @param module The name of the module which contains the function + * @param file The name of the offending source file + * @param line The source code line number where the error occurred + * @param func The name of the function that has not been compiled + * @param module The name of the module which contains the function */ NotCompiled(const std::string &file, int line, const std::string &func, const std::string &module) @@ -158,10 +158,10 @@ class NotSupported : public Error { /** * Initializes a NotSupported error. * - * @param file The name of the offending source file - * @param line The source code line number where the error occurred - * @param func The name of the function where the error occured - * @param obj_type The object type on which the requested operation + * @param file The name of the offending source file + * @param line The source code line number where the error occurred + * @param func The name of the function where the error occurred + * @param obj_type The object type on which the requested operation cannot be performed. */ NotSupported(const std::string &file, int line, const std::string &func, @@ -181,10 +181,10 @@ class CudaError : public Error { /** * Initializes a CUDA error.
* - * @param file The name of the offending source file - * @param line The source code line number where the error occurred - * @param func The name of the CUDA routine that failed - * @param error_code The resulting CUDA error code + * @param file The name of the offending source file + * @param line The source code line number where the error occurred + * @param func The name of the CUDA routine that failed + * @param error_code The resulting CUDA error code */ CudaError(const std::string &file, int line, const std::string &func, int64 error_code) @@ -204,10 +204,10 @@ class CublasError : public Error { /** * Initializes a cuBLAS error. * - * @param file The name of the offending source file - * @param line The source code line number where the error occurred - * @param func The name of the cuBLAS routine that failed - * @param error_code The resulting cuBLAS error code + * @param file The name of the offending source file + * @param line The source code line number where the error occurred + * @param func The name of the cuBLAS routine that failed + * @param error_code The resulting cuBLAS error code */ CublasError(const std::string &file, int line, const std::string &func, int64 error_code) @@ -227,10 +227,10 @@ class CusparseError : public Error { /** * Initializes a cuSPARSE error. 
* - * @param file The name of the offending source file - * @param line The source code line number where the error occurred - * @param func The name of the cuSPARSE routine that failed - * @param error_code The resulting cuSPARSE error code + * @param file The name of the offending source file + * @param line The source code line number where the error occurred + * @param func The name of the cuSPARSE routine that failed + * @param error_code The resulting cuSPARSE error code */ CusparseError(const std::string &file, int line, const std::string &func, int64 error_code) @@ -242,6 +242,76 @@ class CusparseError : public Error { }; +/** + * HipError is thrown when a HIP routine throws a non-zero error code. + */ +class HipError : public Error { +public: + /** + * Initializes a HIP error. + * + * @param file The name of the offending source file + * @param line The source code line number where the error occurred + * @param func The name of the HIP routine that failed + * @param error_code The resulting HIP error code + */ + HipError(const std::string &file, int line, const std::string &func, + int64 error_code) + : Error(file, line, func + ": " + get_error(error_code)) + {} + +private: + static std::string get_error(int64 error_code); +}; + + +/** + * HipblasError is thrown when a hipBLAS routine throws a non-zero error code. + */ +class HipblasError : public Error { +public: + /** + * Initializes a hipBLAS error. 
+ * + * @param file The name of the offending source file + * @param line The source code line number where the error occurred + * @param func The name of the hipBLAS routine that failed + * @param error_code The resulting hipBLAS error code + */ + HipblasError(const std::string &file, int line, const std::string &func, + int64 error_code) + : Error(file, line, func + ": " + get_error(error_code)) + {} + +private: + static std::string get_error(int64 error_code); +}; + + +/** + * HipsparseError is thrown when a hipSPARSE routine throws a non-zero error + * code. + */ +class HipsparseError : public Error { +public: + /** + * Initializes a hipSPARSE error. + * + * @param file The name of the offending source file + * @param line The source code line number where the error occurred + * @param func The name of the hipSPARSE routine that failed + * @param error_code The resulting hipSPARSE error code + */ + HipsparseError(const std::string &file, int line, const std::string &func, + int64 error_code) + : Error(file, line, func + ": " + get_error(error_code)) + {} + +private: + static std::string get_error(int64 error_code); +}; + + /** * DimensionMismatch is thrown if an operation is being applied to LinOps of * incompatible size. @@ -251,16 +321,16 @@ class DimensionMismatch : public Error { /** * Initializes a dimension mismatch error. 
* - * @param file The name of the offending source file - * @param line The source code line number where the error occurred - * @param func The function name where the error occurred - * @param first_name The name of the first operator - * @param first_rows The output dimension of the first operator - * @param first_cols The input dimension of the first operator - * @param second_name The name of the second operator - * @param second_rows The output dimension of the second operator - * @param second_cols The input dimension of the second operator - * @param clarification An additional message describing the error further + * @param file The name of the offending source file + * @param line The source code line number where the error occurred + * @param func The function name where the error occurred + * @param first_name The name of the first operator + * @param first_rows The output dimension of the first operator + * @param first_cols The input dimension of the first operator + * @param second_name The name of the second operator + * @param second_rows The output dimension of the second operator + * @param second_cols The input dimension of the second operator + * @param clarification An additional message describing the error further */ DimensionMismatch(const std::string &file, int line, const std::string &func, const std::string &first_name, @@ -286,13 +356,13 @@ class BadDimension : public Error { /** * Initializes a bad dimension error. 
* - * @param file The name of the offending source file - * @param line The source code line number where the error occurred - * @param func The function name where the error occurred - * @param op_name The name of the operator - * @param op_num_rows The row dimension of the operator - * @param op_num_cols The column dimension of the operator - * @param clarification An additional message further describing the error + * @param file The name of the offending source file + * @param line The source code line number where the error occurred + * @param func The function name where the error occurred + * @param op_name The name of the operator + * @param op_num_rows The row dimension of the operator + * @param op_num_cols The column dimension of the operator + * @param clarification An additional message further describing the error */ BadDimension(const std::string &file, int line, const std::string &func, const std::string &op_name, size_type op_num_rows, @@ -313,12 +383,12 @@ class ValueMismatch : public Error { /** * Initializes a value mismatch error. * - * @param file The name of the offending source file - * @param line The source code line number where the error occurred - * @param func The function name where the error occurred - * @param val1 The first value to be compared. - * @param val2 The second value to be compared. - * @param clarification An additional message further describing the error + * @param file The name of the offending source file + * @param line The source code line number where the error occurred + * @param func The function name where the error occurred + * @param val1 The first value to be compared. + * @param val2 The second value to be compared. + * @param clarification An additional message further describing the error */ ValueMismatch(const std::string &file, int line, const std::string &func, size_type val1, size_type val2, @@ -338,10 +408,10 @@ class AllocationError : public Error { /** * Initializes an allocation error. 
* - * @param file The name of the offending source file - * @param line The source code line number where the error occurred - * @param device The device on which the error occurred - * @param bytes The size of the memory block whose allocation failed. + * @param file The name of the offending source file + * @param line The source code line number where the error occurred + * @param device The device on which the error occurred + * @param bytes The size of the memory block whose allocation failed. */ AllocationError(const std::string &file, int line, const std::string &device, size_type bytes) @@ -384,10 +454,10 @@ class StreamError : public Error { /** * Initializes a file access error. * - * @param file The name of the offending source file - * @param line The source code line number where the error occurred - * @param func The name of the function that tried to access the file - * @param message The error message + * @param file The name of the offending source file + * @param line The source code line number where the error occurred + * @param func The name of the function that tried to access the file + * @param message The error message */ StreamError(const std::string &file, int line, const std::string &func, const std::string &message) @@ -405,9 +475,9 @@ class KernelNotFound : public Error { /** * Initializes a KernelNotFound error. 
* - * @param file The name of the offending source file - * @param line The source code line number where the error occurred - * @param func The name of the function where the error occurred + * @param file The name of the offending source file + * @param line The source code line number where the error occurred + * @param func The name of the function where the error occurred */ KernelNotFound(const std::string &file, int line, const std::string &func) : Error(file, line, func + ": unable to find an eligible kernel") @@ -418,4 +488,4 @@ class KernelNotFound : public Error { } // namespace gko -#endif // GKO_CORE_EXCEPTION_HPP_ +#endif // GKO_CORE_BASE_EXCEPTION_HPP_ diff --git a/include/ginkgo/core/base/exception_helpers.hpp b/include/ginkgo/core/base/exception_helpers.hpp index 41225f06c98..774ff3fda07 100644 --- a/include/ginkgo/core/base/exception_helpers.hpp +++ b/include/ginkgo/core/base/exception_helpers.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,8 +30,11 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_CORE_EXCEPTION_HELPERS_HPP_ -#define GKO_CORE_EXCEPTION_HELPERS_HPP_ +#ifndef GKO_CORE_BASE_EXCEPTION_HELPERS_HPP_ +#define GKO_CORE_BASE_EXCEPTION_HELPERS_HPP_ + + +#include #include @@ -39,9 +42,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include - - namespace gko { @@ -88,6 +88,34 @@ namespace gko { "semi-colon warnings") +namespace detail { + + +template +struct dynamic_type_helper { + static const std::type_info &get(const T &obj) { return typeid(obj); } +}; + +template +struct dynamic_type_helper::value || + have_ownership()>::type> { + static const std::type_info &get(const T &obj) + { + return obj ? typeid(*obj) : typeid(nullptr); + } +}; + +template +const std::type_info &get_dynamic_type(const T &obj) +{ + return dynamic_type_helper::get(obj); +} + + +} // namespace detail + + /** * Throws a NotSupported exception. * This macro sets the correct information about the location of the error @@ -95,14 +123,14 @@ namespace gko { * * @param _obj the object referenced by NotSupported exception */ -#define GKO_NOT_SUPPORTED(_obj) \ - { \ - throw ::gko::NotSupported( \ - __FILE__, __LINE__, __func__, \ - ::gko::name_demangling::get_type_name(typeid(_obj))); \ - } \ - static_assert(true, \ - "This assert is used to counter the false positive extra " \ +#define GKO_NOT_SUPPORTED(_obj) \ + { \ + throw ::gko::NotSupported(__FILE__, __LINE__, __func__, \ + ::gko::name_demangling::get_type_name( \ + ::gko::detail::get_dynamic_type(_obj))); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ "semi-colon warnings") @@ -280,7 +308,7 @@ inline dim<2> get_size(const dim<2> &size) { return size; } /** * Asserts that a cuBLAS library call completed without errors. * - * @param _cuda_call a library call expression + * @param _cublas_call a library call expression */ #define GKO_ASSERT_NO_CUBLAS_ERRORS(_cublas_call) \ do { \ @@ -294,7 +322,7 @@ inline dim<2> get_size(const dim<2> &size) { return size; } /** * Asserts that a cuSPARSE library call completed without errors. 
* - * @param _cuda_call a library call expression + * @param _cusparse_call a library call expression */ #define GKO_ASSERT_NO_CUSPARSE_ERRORS(_cusparse_call) \ do { \ @@ -305,6 +333,75 @@ inline dim<2> get_size(const dim<2> &size) { return size; } } while (false) +/** + * Instantiates a HipError. + * + * @param errcode The error code returned from a HIP runtime API routine. + */ +#define GKO_HIP_ERROR(_errcode) \ + ::gko::HipError(__FILE__, __LINE__, __func__, _errcode) + + +/** + * Instantiates a HipblasError. + * + * @param errcode The error code returned from the HIPBLAS routine. + */ +#define GKO_HIPBLAS_ERROR(_errcode) \ + ::gko::HipblasError(__FILE__, __LINE__, __func__, _errcode) + + +/** + * Instantiates a HipsparseError. + * + * @param errcode The error code returned from the HIPSPARSE routine. + */ +#define GKO_HIPSPARSE_ERROR(_errcode) \ + ::gko::HipsparseError(__FILE__, __LINE__, __func__, _errcode) + + +/** + * Asserts that a HIP library call completed without errors. + * + * @param _hip_call a library call expression + */ +#define GKO_ASSERT_NO_HIP_ERRORS(_hip_call) \ + do { \ + auto _errcode = _hip_call; \ + if (_errcode != hipSuccess) { \ + throw GKO_HIP_ERROR(_errcode); \ + } \ + } while (false) + + +/** + * Asserts that a HIPBLAS library call completed without errors. + * + * @param _hipblas_call a library call expression + */ +#define GKO_ASSERT_NO_HIPBLAS_ERRORS(_hipblas_call) \ + do { \ + auto _errcode = _hipblas_call; \ + if (_errcode != HIPBLAS_STATUS_SUCCESS) { \ + throw GKO_HIPBLAS_ERROR(_errcode); \ + } \ + } while (false) + + +/** + * Asserts that a HIPSPARSE library call completed without errors. 
+ * + * @param _hipsparse_call a library call expression + */ +#define GKO_ASSERT_NO_HIPSPARSE_ERRORS(_hipsparse_call) \ + do { \ + auto _errcode = _hipsparse_call; \ + if (_errcode != HIPSPARSE_STATUS_SUCCESS) { \ + throw GKO_HIPSPARSE_ERROR(_errcode); \ + } \ + } while (false) + + namespace detail { @@ -357,6 +454,25 @@ inline T ensure_allocated_impl(T ptr, const std::string &file, int line, "semi-colon warnings") +/** + * Ensures that two dimensions have compatible bounds, in particular before a + * copy operation. This means the target should have at least as many elements + * as the source. + * + * @param _source the source of the expected copy operation + * @param _target the destination of the expected copy operation + * + * @throw OutOfBoundsError if `_source > _target` + */ +#define GKO_ENSURE_COMPATIBLE_BOUNDS(_source, _target) \ + if (_source > _target) { \ + throw ::gko::OutOfBoundsError(__FILE__, __LINE__, _source, _target); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + + /** * Creates a StreamError exception. * This macro sets the correct information about the location of the error @@ -389,4 +505,4 @@ inline T ensure_allocated_impl(T ptr, const std::string &file, int line, } // namespace gko -#endif // GKO_CORE_EXCEPTION_HELPERS_HPP_ +#endif // GKO_CORE_BASE_EXCEPTION_HELPERS_HPP_ diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index 115978f9b18..1df29abc59c 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved.
Redistribution and use in source and binary forms, with or without @@ -30,8 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_CORE_EXECUTOR_HPP_ -#define GKO_CORE_EXECUTOR_HPP_ +#ifndef GKO_CORE_BASE_EXECUTOR_HPP_ +#define GKO_CORE_BASE_EXECUTOR_HPP_ #include @@ -50,6 +50,10 @@ struct cublasContext; struct cusparseContext; +struct hipblasContext; + +struct hipsparseContext; + namespace gko { @@ -107,6 +111,9 @@ class ExecutorBase; * void run(const gko::CudaExecutor *exec) const override * { os_ << "CUDA(" << exec->get_device_id() << ")"; } * + * void run(const gko::HipExecutor *exec) const override + * { os_ << "HIP(" << exec->get_device_id() << ")"; } + * * // This is optional, if not overloaded, defaults to OmpExecutor overload * void run(const gko::ReferenceExecutor *) const override * { os_ << "Reference CPU"; } @@ -134,6 +141,7 @@ class ExecutorBase; * auto omp = gko::OmpExecutor::create(); * std::cout << *omp << std::endl * << *gko::CudaExecutor::create(0, omp) << std::endl + * << *gko::HipExecutor::create(0, omp) << std::endl * << *gko::ReferenceExecutor::create() << std::endl; * ``` * @@ -142,15 +150,16 @@ class ExecutorBase; * ``` * OMP * CUDA(0) + * HIP(0) * Reference CPU * ``` * * One might feel that this code is too complicated for such a simple task. * Luckily, there is an overload of the Executor::run() method, which is * designed to facilitate writing simple operations like this one. The method - * takes two closures as input: one which is run for OMP, and the other one for - * CUDA executors. Using this method, there is no need to implement an Operation - * subclass: + * takes three closures as input: one which is run for OMP, one for + * CUDA executors, and the last one for HIP executors. 
Using this method, there + * is no need to implement an Operation subclass: * * ``` * std::ostream& operator<<(std::ostream &os, const gko::Executor &exec) @@ -160,6 +169,10 @@ class ExecutorBase; * [&]() { os << "CUDA(" // CUDA closure * << static_cast(exec) * .get_device_id() + * << ")"; }, + * [&]() { os << "HIP(" // HIP closure + * << static_cast(exec) + * .get_device_id() * << ")"; }); * return os; * } @@ -237,7 +250,8 @@ private: \ * kernel when the operation is executed. * * The kernels used to bind the operation are searched in `kernels::DEV_TYPE` - * namespace, where `DEV_TYPE` is replaced by `omp`, `cuda` and `reference`. + * namespace, where `DEV_TYPE` is replaced by `omp`, `cuda`, `hip` and + * `reference`. * * @param _name operation name * @param _kernel kernel which will be bound to the operation @@ -246,7 +260,7 @@ private: \ * ------- * * ```c++ - * // define the omp, cuda and reference kernels which will be bound to the + * // define the omp, cuda, hip and reference kernels which will be bound to the * // operation * namespace kernels { * namespace omp { @@ -259,6 +273,11 @@ private: \ * // cuda code * } * } + * namespace hip { + * void my_kernel(int x) { + * // hip code + * } + * } * namespace reference { * void my_kernel(int x) { * // reference code @@ -272,6 +291,7 @@ private: \ * // create executors * auto omp = OmpExecutor::create(); * auto cuda = CudaExecutor::create(omp, 0); + * auto hip = HipExecutor::create(omp, 0); * auto ref = ReferenceExecutor::create(); * * // create the operation @@ -279,6 +299,7 @@ private: \ * * omp->run(op); // run omp kernel * cuda->run(op); // run cuda kernel + * hip->run(op); // run hip kernel * ref->run(op); // run reference kernel * } * ``` @@ -308,6 +329,7 @@ private: \ \ GKO_KERNEL_DETAIL_DEFINE_RUN_OVERLOAD(OmpExecutor, omp, _kernel); \ GKO_KERNEL_DETAIL_DEFINE_RUN_OVERLOAD(CudaExecutor, cuda, _kernel); \ + GKO_KERNEL_DETAIL_DEFINE_RUN_OVERLOAD(HipExecutor, hip, _kernel); \ 
GKO_KERNEL_DETAIL_DEFINE_RUN_OVERLOAD(ReferenceExecutor, reference, \ _kernel); \ \ @@ -335,6 +357,8 @@ private: \ * operations executed on an OpenMP-supporting device (e.g. host CPU); * + CudaExecutor specifies that the data should be stored and the * operations executed on the NVIDIA GPU accelerator; + * + HipExecutor specifies that the data should be stored and the + * operations executed on either an NVIDIA or AMD GPU accelerator; * + ReferenceExecutor executes a non-optimized reference implementation, * which can be used to debug the library. * @@ -433,15 +457,19 @@ class Executor : public log::EnableLogging { * * @tparam ClosureOmp type of op_omp * @tparam ClosureCuda type of op_cuda + * @tparam ClosureHip type of op_hip * * @param op_omp functor to run in case of a OmpExecutor or * ReferenceExecutor * @param op_cuda functor to run in case of a CudaExecutor + * @param op_hip functor to run in case of a HipExecutor */ - template - void run(const ClosureOmp &op_omp, const ClosureCuda &op_cuda) const + template + void run(const ClosureOmp &op_omp, const ClosureCuda &op_cuda, + const ClosureHip &op_hip) const { - LambdaOperation op(op_omp, op_cuda); + LambdaOperation op(op_omp, op_cuda, + op_hip); this->run(op); } @@ -508,6 +536,40 @@ class Executor : public log::EnableLogging { reinterpret_cast(dest_ptr), num_elems * sizeof(T)); } + /** + * Copies data within this Executor. + * + * @tparam T datatype to copy + * + * @param num_elems number of elements of type T to copy + * @param src_ptr pointer to a block of memory containing the data to be + * copied + * @param dest_ptr pointer to an allocated block of memory + * where the data will be copied to + */ + template + void copy(size_type num_elems, const T *src_ptr, T *dest_ptr) const + { + this->copy_from(this, num_elems, src_ptr, dest_ptr); + } + + /** + * Retrieves a single element at the given location from executor memory. 
+ * + * @tparam T datatype to copy + * + * @param ptr the pointer to the element to be copied + * + * @return the value stored at ptr + */ + template + T copy_val_to_host(const T *ptr) const + { + T out{}; + this->get_master()->copy_from(this, 1, ptr, &out); + return out; + } + /** * Returns the master OmpExecutor of this Executor. * @return the master OmpExecutor of this Executor. @@ -577,16 +639,19 @@ class Executor : public log::EnableLogging { private: /** - * The LambdaOperation class wraps two functor objects into an Operation. + * The LambdaOperation class wraps three functor objects into an + * Operation. * - * The first object is called by the OmpExecutor, while the other one by the - * CudaExecutor. When run on the ReferenceExecutor, the implementation will - * launch the CPU reference version. + * The first object is called by the OmpExecutor, the second one by the + * CudaExecutor and the last one by the HipExecutor. When run on the + * ReferenceExecutor, the implementation will launch the CPU reference + * version. 
* * @tparam ClosureOmp the type of the first functor * @tparam ClosureCuda the type of the second functor + * @tparam ClosureHip the type of the third functor */ - template + template class LambdaOperation : public Operation { public: /** @@ -595,9 +660,11 @@ class Executor : public log::EnableLogging { * @param op_omp a functor object which will be called by OmpExecutor * and ReferenceExecutor * @param op_cuda a functor object which will be called by CudaExecutor + * @param op_hip a functor object which will be called by HipExecutor */ - LambdaOperation(const ClosureOmp &op_omp, const ClosureCuda &op_cuda) - : op_omp_(op_omp), op_cuda_(op_cuda) + LambdaOperation(const ClosureOmp &op_omp, const ClosureCuda &op_cuda, + const ClosureHip &op_hip) + : op_omp_(op_omp), op_cuda_(op_cuda), op_hip_(op_hip) {} void run(std::shared_ptr) const override @@ -610,9 +677,15 @@ class Executor : public log::EnableLogging { op_cuda_(); } + void run(std::shared_ptr) const override + { + op_hip_(); + } + private: ClosureOmp op_omp_; ClosureCuda op_cuda_; + ClosureHip op_hip_; }; }; @@ -710,6 +783,43 @@ class ExecutorBase : public Executor { }; +/** + * Controls whether the DeviceReset function should be called thanks to a + * boolean. Note that in any case, `DeviceReset` is called only after destroying + * the last Ginkgo executor. Therefore, it is sufficient to set this flag to the + * last living executor in Ginkgo. Setting this flag to an executor which is not + * destroyed last has no effect. + */ +class EnableDeviceReset { +public: + /** + * Set the device reset capability. + * + * @param device_reset whether to allow a device reset or not + */ + void set_device_reset(bool device_reset) { device_reset_ = device_reset; } + + /** + * Returns the current status of the device reset boolean for this executor. + * + * @return the current status of the device reset boolean for this executor. 
+ */ + bool get_device_reset() { return device_reset_; } + +protected: + /** + * Instantiate an EnableDeviceReset class + * + * @param device_reset the starting device_reset status. Defaults to false. + */ + EnableDeviceReset(bool device_reset = false) : device_reset_{device_reset} + {} + +private: + bool device_reset_{}; +}; + + } // namespace detail @@ -803,7 +913,8 @@ using DefaultExecutor = ReferenceExecutor; * @ingroup Executor */ class CudaExecutor : public detail::ExecutorBase, - public std::enable_shared_from_this { + public std::enable_shared_from_this, + public detail::EnableDeviceReset { friend class detail::ExecutorBase; public: @@ -815,7 +926,8 @@ class CudaExecutor : public detail::ExecutorBase, * kernels */ static std::shared_ptr create( - int device_id, std::shared_ptr master); + int device_id, std::shared_ptr master, + bool device_reset = false); ~CudaExecutor() { decrease_num_execs(this->device_id_); } @@ -838,9 +950,9 @@ class CudaExecutor : public detail::ExecutorBase, static int get_num_devices(); /** - * Get the number of cores per SM of this executor. + * Get the number of warps per SM of this executor. */ - int get_num_cores_per_sm() const noexcept { return num_cores_per_sm_; } + int get_num_warps_per_sm() const noexcept { return num_warps_per_sm_; } /** * Get the number of multiprocessor of this executor. @@ -852,11 +964,14 @@ class CudaExecutor : public detail::ExecutorBase, */ int get_num_warps() const noexcept { - constexpr uint32 warp_size = 32; - auto warps_per_sm = num_cores_per_sm_ / warp_size; - return num_multiprocessor_ * warps_per_sm; + return num_multiprocessor_ * num_warps_per_sm_; } + /** + * Get the warp size of this executor. + */ + int get_warp_size() const noexcept { return warp_size_; } + /** * Get the major verion of compute capability. 
*/ @@ -889,13 +1004,16 @@ class CudaExecutor : public detail::ExecutorBase, void init_handles(); - CudaExecutor(int device_id, std::shared_ptr master) - : device_id_(device_id), + CudaExecutor(int device_id, std::shared_ptr master, + bool device_reset = false) + : EnableDeviceReset{device_reset}, + device_id_(device_id), master_(master), - num_cores_per_sm_(0), + num_warps_per_sm_(0), num_multiprocessor_(0), major_(0), - minor_(0) + minor_(0), + warp_size_(0) { assert(device_id < max_devices && device_id >= 0); this->set_gpu_property(); @@ -930,10 +1048,11 @@ class CudaExecutor : public detail::ExecutorBase, private: int device_id_; std::shared_ptr master_; - int num_cores_per_sm_; + int num_warps_per_sm_; int num_multiprocessor_; int major_; int minor_; + int warp_size_; template using handle_manager = std::unique_ptr>; @@ -953,10 +1072,176 @@ using DefaultExecutor = CudaExecutor; } // namespace kernels +/** + * This is the Executor subclass which represents the HIP enhanced device. + * + * @ingroup exec_hip + * @ingroup Executor + */ +class HipExecutor : public detail::ExecutorBase, + public std::enable_shared_from_this, + public detail::EnableDeviceReset { + friend class detail::ExecutorBase; + +public: + /** + * Creates a new HipExecutor. + * + * @param device_id the HIP device id of this device + * @param master an executor on the host that is used to invoke the device + * kernels + */ + static std::shared_ptr create(int device_id, + std::shared_ptr master, + bool device_reset = false); + + ~HipExecutor() { decrease_num_execs(this->device_id_); } + + std::shared_ptr get_master() noexcept override; + + std::shared_ptr get_master() const noexcept override; + + void synchronize() const override; + + void run(const Operation &op) const override; + + /** + * Get the HIP device id of the device associated to this executor. + */ + int get_device_id() const noexcept { return device_id_; } + + /** + * Get the number of devices present on the system. 
+ */ + static int get_num_devices(); + + /** + * Get the number of warps per SM of this executor. + */ + int get_num_warps_per_sm() const noexcept { return num_warps_per_sm_; } + + /** + * Get the number of multiprocessor of this executor. + */ + int get_num_multiprocessor() const noexcept { return num_multiprocessor_; } + + /** + * Get the major version of compute capability. + */ + int get_major_version() const noexcept { return major_; } + + /** + * Get the minor version of compute capability. + */ + int get_minor_version() const noexcept { return minor_; } + + /** + * Get the number of warps of this executor. + */ + int get_num_warps() const noexcept + { + return num_multiprocessor_ * num_warps_per_sm_; + } + + /** + * Get the warp size of this executor. + */ + int get_warp_size() const noexcept { return warp_size_; } + + /** + * Get the hipblas handle for this executor + * + * @return the hipblas handle (hipblasContext*) for this executor + */ + hipblasContext *get_hipblas_handle() const { return hipblas_handle_.get(); } + + /** + * Get the hipsparse handle for this executor + * + * @return the hipsparse handle (hipsparseContext*) for this executor + */ + hipsparseContext *get_hipsparse_handle() const + { + return hipsparse_handle_.get(); + } + +protected: + void set_gpu_property(); + + void init_handles(); + + HipExecutor(int device_id, std::shared_ptr master, + bool device_reset = false) + : EnableDeviceReset{device_reset}, + device_id_(device_id), + master_(master), + num_multiprocessor_(0), + num_warps_per_sm_(0), + major_(0), + minor_(0), + warp_size_(0) + { + assert(device_id < max_devices); + this->set_gpu_property(); + this->init_handles(); + increase_num_execs(device_id); + } + + void *raw_alloc(size_type size) const override; + + void raw_free(void *ptr) const noexcept override; + + GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO); + + static void increase_num_execs(int device_id) + { + std::lock_guard guard(mutex[device_id]); +
num_execs[device_id]++; + } + + static void decrease_num_execs(int device_id) + { + std::lock_guard guard(mutex[device_id]); + num_execs[device_id]--; + } + + static int get_num_execs(int device_id) + { + std::lock_guard guard(mutex[device_id]); + return num_execs[device_id]; + } + +private: + int device_id_; + std::shared_ptr master_; + int num_multiprocessor_; + int num_warps_per_sm_; + int major_; + int minor_; + int warp_size_; + + template + using handle_manager = std::unique_ptr>; + handle_manager hipblas_handle_; + handle_manager hipsparse_handle_; + + static constexpr int max_devices = 64; + static int num_execs[max_devices]; + static std::mutex mutex[max_devices]; +}; + + +namespace kernels { +namespace hip { +using DefaultExecutor = HipExecutor; +} // namespace hip +} // namespace kernels + + #undef GKO_OVERRIDE_RAW_COPY_TO } // namespace gko -#endif // GKO_CORE_EXECUTOR_HPP_ +#endif // GKO_CORE_BASE_EXECUTOR_HPP_ diff --git a/include/ginkgo/core/base/lin_op.hpp b/include/ginkgo/core/base/lin_op.hpp index 4a85c73326d..b57a9c1cc1a 100644 --- a/include/ginkgo/core/base/lin_op.hpp +++ b/include/ginkgo/core/base/lin_op.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,6 +33,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef GKO_CORE_BASE_LIN_OP_HPP_ #define GKO_CORE_BASE_LIN_OP_HPP_ + +#include +#include + + #include #include #include @@ -44,10 +49,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include -#include - - namespace gko { @@ -220,6 +221,15 @@ class LinOp : public EnableAbstractPolymorphicObject { */ const dim<2> &get_size() const noexcept { return size_; } + /** + * Returns true if the linear operator uses the data given in x as + * an initial guess. 
Returns false otherwise. + * + * @return true if the linear operator uses the data given in x as + * an initial guess. Returns false otherwise. + */ + virtual bool apply_uses_initial_guess() const { return false; } + protected: /** * Creates a linear operator. @@ -416,6 +426,89 @@ class Transposable { }; +/** + * Linear operators which support permutation should implement the + * Permutable interface. + * + * It provides four functionalities, the row permute, the + * column permute, the inverse row permute and the inverse column permute. + * + * The row permute returns the permutation of the linear operator after + * permuting the rows of the linear operator. For example, if for a matrix A, + * the permuted matrix A' and the permutation array perm, the row i of the + * matrix A is the row perm[i] in the matrix A'. And similarly, for the inverse + * permutation, the row i in the matrix A' is the row perm[i] in the matrix A. + * + * The column permute returns the permutation of the linear operator after + * permuting the columns of the linear operator. The definitions of permute and + * inverse permute for the row_permute hold here as well. + * + * Example: Permuting a Csr matrix: + * ------------------------------------ + * + * ```c++ + * //Permuting an object of LinOp type. + * //The object you want to permute. + * auto op = matrix::Csr::create(exec); + * //Permute the object by first converting it to a Permutable type. + * auto perm = op->row_permute(permutation_indices); + * ``` + */ +template +class Permutable { +public: + virtual ~Permutable() = default; + + /** + * Returns a LinOp representing the row permutation of the Permutable + * object. + * + * @param permutation_indices the array of indices containing the + * permutation order.
+ * + * @return a pointer to the new permuted object + */ + virtual std::unique_ptr row_permute( + const Array *permutation_indices) const = 0; + + /** + * Returns a LinOp representing the column permutation of the Permutable + * object. + * + * @param permutation_indices the array of indices containing the + * permutation order. + * + * @return a pointer to the new column permuted object + */ + virtual std::unique_ptr column_permute( + const Array *permutation_indices) const = 0; + + /** + * Returns a LinOp representing the row permutation of the inverse permuted + * object. + * + * @param inverse_permutation_indices the array of indices containing the + * inverse permutation order. + * + * @return a pointer to the new inverse permuted object + */ + virtual std::unique_ptr inverse_row_permute( + const Array *inverse_permutation_indices) const = 0; + + /** + * Returns a LinOp representing the column permutation of the inverse + * permuted object. + * + * @param inverse_permutation_indices the array of indices containing the + * inverse permutation order. + * + * @return a pointer to the new inverse permuted object + */ + virtual std::unique_ptr inverse_column_permute( + const Array *inverse_permutation_indices) const = 0; +}; + + /** * A LinOp implementing this interface can read its data from a matrix_data * structure. @@ -762,7 +855,7 @@ public: \ "semi-colon warnings") -#ifndef __CUDACC__ +#if !(defined(__CUDACC__) || defined(__HIPCC__)) /** * Creates a factory parameter in the factory parameters structure. * @@ -787,18 +880,24 @@ public: \ static_assert(true, \ "This assert is used to counter the false positive extra " \ "semi-colon warnings") -#else // __CUDACC__ +#else // defined(__CUDACC__) || defined(__HIPCC__) // A workaround for the NVCC compiler - parameter pack expansion does not work // properly. You won't be able to use factories in code compiled with NVCC, but // at least this won't trigger a compiler error as soon as a header using it is -// included.
-#define GKO_FACTORY_PARAMETER(_name, ...) \ - mutable _name{__VA_ARGS__}; \ - \ - template \ - auto with_##_name(Args &&... _value) \ - const->const ::gko::xstd::decay_t & -#endif // __CUDACC__ +// included. To not get a linker error, we provide a dummy body. +#define GKO_FACTORY_PARAMETER(_name, ...) \ + mutable _name{__VA_ARGS__}; \ + \ + template \ + auto with_##_name(Args &&... _value) \ + const->const ::gko::xstd::decay_t & \ + { \ + return *this; \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") +#endif // defined(__CUDACC__) || defined(__HIPCC__) } // namespace gko diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index e993b18d240..1b7a2f9f18f 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,17 +34,75 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_CORE_BASE_MATH_HPP_ +#include +#include +#include +#include + + +#include #include #include #include -#include -#include -#include +namespace gko { -namespace gko { +// HIP should not see std::abs or std::sqrt, we want the custom implementation. 
+// Hence, provide the using declaration only for some cases +namespace kernels { +namespace reference { + + +using std::abs; + + +using std::sqrt; + + +} // namespace reference +} // namespace kernels + + +namespace kernels { +namespace omp { + + +using std::abs; + + +using std::sqrt; + + +} // namespace omp +} // namespace kernels + + +namespace kernels { +namespace cuda { + + +using std::abs; + + +using std::sqrt; + + +} // namespace cuda +} // namespace kernels + + +namespace test { + + +using std::abs; + + +using std::sqrt; + + +} // namespace test // type manipulations @@ -86,6 +144,29 @@ struct is_complex_impl> } // namespace detail +/** + * Access the underlying real type of a complex number. + * + * @tparam T the type being checked. + */ +template +struct cpx_real_type { + /** The type. When the type is not complex, return the type itself.*/ + using type = T; +}; + +/** + * Specialization for complex types. + * + * @copydoc cpx_real_type + */ +template +struct cpx_real_type> { + /** The type. When the type is complex, return the underlying value_type.*/ + using type = typename std::complex::value_type; +}; + + /** * Obtains a real counterpart of a std::complex type, and leaves the type * unchanged if it is not a complex type. @@ -122,6 +203,26 @@ GKO_INLINE GKO_ATTRIBUTES constexpr bool is_complex() namespace detail { +// singly linked list of all our supported precisions +template +struct next_precision_impl {}; + +template <> +struct next_precision_impl { + using type = double; +}; + +template <> +struct next_precision_impl { + using type = float; +}; + +template +struct next_precision_impl> { + using type = std::complex::type>; +}; + + template struct reduce_precision_impl { using type = T; @@ -164,9 +265,24 @@ struct increase_precision_impl { }; +template +struct infinity_impl { + // CUDA doesn't allow us to call std::numeric_limits functions + // so we need to store the value instead. 
+ static constexpr auto value = std::numeric_limits::infinity(); +}; + + } // namespace detail +/** + * Obtains the next type in the singly-linked precision list. + */ +template +using next_precision = typename detail::next_precision_impl::type; + + /** * Obtains the next type in the hierarchy with lower precision than T. */ @@ -295,6 +411,128 @@ GKO_INLINE GKO_ATTRIBUTES constexpr int64 ceildiv(int64 num, int64 den) } +#if defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC + + +/** + * Returns the additive identity for T. + * + * @return additive identity for T + */ +template +GKO_INLINE __host__ constexpr T zero() +{ + return T{}; +} + + +/** + * Returns the additive identity for T. + * + * @return additive identity for T + * + * @note This version takes an unused reference argument to avoid + * complicated calls like `zero()`. Instead, it allows + * `zero(x)`. + */ +template +GKO_INLINE __host__ constexpr T zero(const T &) +{ + return zero(); +} + + +/** + * Returns the multiplicative identity for T. + * + * @return the multiplicative identity for T + */ +template +GKO_INLINE __host__ constexpr T one() +{ + return T(1); +} + + +/** + * Returns the multiplicative identity for T. + * + * @return the multiplicative identity for T + * + * @note This version takes an unused reference argument to avoid + * complicated calls like `one()`. Instead, it allows + * `one(x)`. + */ +template +GKO_INLINE __host__ constexpr T one(const T &) +{ + return one(); +} + + +/** + * Returns the additive identity for T. + * + * @return additive identity for T + */ +template +GKO_INLINE __device__ constexpr xstd::enable_if_t< + !std::is_same>>::value, T> +zero() +{ + return T{}; +} + + +/** + * Returns the additive identity for T. + * + * @return additive identity for T + * + * @note This version takes an unused reference argument to avoid + * complicated calls like `zero()`. Instead, it allows + * `zero(x)`. 
+ */ +template +GKO_INLINE __device__ constexpr T zero(const T &) +{ + return zero(); +} + + +/** + * Returns the multiplicative identity for T. + * + * @return the multiplicative identity for T + */ +template +GKO_INLINE __device__ constexpr xstd::enable_if_t< + !std::is_same>>::value, T> +one() +{ + return T(1); +} + + +/** + * Returns the multiplicative identity for T. + * + * @return the multiplicative identity for T + * + * @note This version takes an unused reference argument to avoid + * complicated calls like `one()`. Instead, it allows + * `one(x)`. + */ +template +GKO_INLINE __device__ constexpr T one(const T &) +{ + return one(); +} + + +#else + + /** * Returns the additive identity for T. * @@ -303,7 +541,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr int64 ceildiv(int64 num, int64 den) template GKO_INLINE GKO_ATTRIBUTES constexpr T zero() { - return T(0); + return T{}; } @@ -312,8 +550,9 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T zero() * * @return additive identity for T * - * @note This version takes an unused reference argument to avoid complicated - * calls like `zero()`. Instead, it allows `zero(x)`. + * @note This version takes an unused reference argument to avoid + * complicated calls like `zero()`. Instead, it allows + * `zero(x)`. */ template GKO_INLINE GKO_ATTRIBUTES constexpr T zero(const T &) @@ -339,8 +578,9 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T one() * * @return the multiplicative identity for T * - * @note This version takes an unused reference argument to avoid complicated - * calls like `one()`. Instead, it allows `one(x)`. + * @note This version takes an unused reference argument to avoid + * complicated calls like `one()`. Instead, it allows + * `one(x)`. */ template GKO_INLINE GKO_ATTRIBUTES constexpr T one(const T &) @@ -349,6 +589,12 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T one(const T &) } +#endif // defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC + + +#undef GKO_BIND_ZERO_ONE + + /** * Returns the absolute value of the object. 
* @@ -365,9 +611,6 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T abs(const T &x) } -using std::abs; // use optimized abs functions for basic types - - /** * Returns the larger of the arguments. * @@ -418,11 +661,21 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T min(const T &x, const T &y) * @return real part of the object (by default, the object itself) */ template -GKO_ATTRIBUTES GKO_INLINE constexpr T real(const T &x) +GKO_ATTRIBUTES + GKO_INLINE constexpr xstd::enable_if_t::value, T> + real(const T &x) { return x; } +template +GKO_ATTRIBUTES GKO_INLINE constexpr xstd::enable_if_t::value, + remove_complex> +real(const T &x) +{ + return x.real(); +} + /** * Returns the imaginary part of the object. @@ -434,11 +687,21 @@ GKO_ATTRIBUTES GKO_INLINE constexpr T real(const T &x) * @return imaginary part of the object (by default, zero()) */ template -GKO_ATTRIBUTES GKO_INLINE constexpr T imag(const T &) +GKO_ATTRIBUTES + GKO_INLINE constexpr xstd::enable_if_t::value, T> + imag(const T &) { return zero(); } +template +GKO_ATTRIBUTES GKO_INLINE constexpr xstd::enable_if_t::value, + remove_complex> +imag(const T &x) +{ + return x.imag(); +} + /** * Returns the conjugate of an object. 
@@ -448,13 +711,18 @@ GKO_ATTRIBUTES GKO_INLINE constexpr T imag(const T &) * @return conjugate of the object (by default, the object itself) */ template -GKO_ATTRIBUTES GKO_INLINE T conj(const T &x) +GKO_ATTRIBUTES GKO_INLINE xstd::enable_if_t::value, T> conj( + const T &x) { return x; } - -using std::sqrt; // use standard sqrt functions for basic types +template +GKO_ATTRIBUTES GKO_INLINE xstd::enable_if_t::value, T> conj( + const T &x) +{ + return T{x.real(), -x.imag()}; +} /** @@ -485,8 +753,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr auto squared_norm(const T &x) * @return maximum of `hint` and the significant bit position of `n` */ template -GKO_INLINE GKO_ATTRIBUTES constexpr uint32 get_significant_bit( - const T &n, uint32 hint = 0u) noexcept +constexpr uint32 get_significant_bit(const T &n, uint32 hint = 0u) noexcept { return (T{1} << (hint + 1)) > n ? hint : get_significant_bit(n, hint + 1u); } @@ -504,29 +771,32 @@ GKO_INLINE GKO_ATTRIBUTES constexpr uint32 get_significant_bit( * @return the smallest power of `base` not smaller than `limit` */ template -GKO_INLINE GKO_ATTRIBUTES constexpr T get_superior_power( - const T &base, const T &limit, const T &hint = T{1}) noexcept +constexpr T get_superior_power(const T &base, const T &limit, + const T &hint = T{1}) noexcept { return hint >= limit ? hint : get_superior_power(base, limit, hint * base); } -#if !defined(__CUDA_ARCH__) - - -// Since a lot of compiler in combination with CUDA seem to have difficulties -// distinguishing between the CUDA `isfinite` and the `std::isfinite` when -// it is put into the `gko` namespace, only enable `std::isfinite` when -// compiling host code. +/** + * Checks if a floating point number is finite, meaning it is + * neither +/- infinity nor NaN. + * + * @tparam T type of the value to check + * + * @param value value to check + * + * @return `true` if the value is finite, meaning it is neither + * +/- infinity nor NaN.
+ */ template GKO_INLINE GKO_ATTRIBUTES xstd::enable_if_t::value, bool> -isfinite(const T &value) +is_finite(const T &value) { - return std::isfinite(value); + constexpr T infinity{detail::infinity_impl::value}; + return abs(value) < infinity; } -#endif // defined(__CUDA_ARCH__) - /** * Checks if all components of a complex value are finite, meaning they are @@ -536,14 +806,14 @@ isfinite(const T &value) * * @param value complex value to check * - * returns `true` if both components of the given value are finite, meaning + * @return `true` if both components of the given value are finite, meaning * they are neither +/- infinity nor NaN. */ template GKO_INLINE GKO_ATTRIBUTES xstd::enable_if_t::value, bool> -isfinite(const T &value) +is_finite(const T &value) { - return isfinite(value.real()) && isfinite(value.imag()); + return is_finite(value.real()) && is_finite(value.imag()); } diff --git a/include/ginkgo/core/base/matrix_data.hpp b/include/ginkgo/core/base/matrix_data.hpp index 7513dcdc0d8..94c01461079 100644 --- a/include/ginkgo/core/base/matrix_data.hpp +++ b/include/ginkgo/core/base/matrix_data.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,6 +34,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_CORE_BASE_MATRIX_DATA_HPP_ +#include +#include +#include +#include + + #include #include #include @@ -42,12 +48,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include -#include -#include -#include - - namespace gko { diff --git a/include/ginkgo/core/base/mtx_io.hpp b/include/ginkgo/core/base/mtx_io.hpp index a20fb67e572..8ebe65e973a 100644 --- a/include/ginkgo/core/base/mtx_io.hpp +++ b/include/ginkgo/core/base/mtx_io.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/include/ginkgo/core/base/name_demangling.hpp b/include/ginkgo/core/base/name_demangling.hpp index 418f1246367..fb73c7ac370 100644 --- a/include/ginkgo/core/base/name_demangling.hpp +++ b/include/ginkgo/core/base/name_demangling.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,12 +30,13 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ -#ifndef GKO_CORE_NAME_DEMANGLING_HPP -#define GKO_CORE_NAME_DEMANGLING_HPP +#ifndef GKO_CORE_BASE_NAME_DEMANGLING_HPP_ +#define GKO_CORE_BASE_NAME_DEMANGLING_HPP_ #include + #ifdef GKO_HAVE_CXXABI_H #include #endif // GKO_HAVE_CXXABI_H @@ -140,4 +141,4 @@ std::string get_enclosing_scope(const T &) } // namespace gko -#endif // GKO_CORE_NAME_DEMANGLING_HPP +#endif // GKO_CORE_BASE_NAME_DEMANGLING_HPP_ diff --git a/include/ginkgo/core/base/perturbation.hpp b/include/ginkgo/core/base/perturbation.hpp index 5c84ef7f8fc..16e4406605b 100644 --- a/include/ginkgo/core/base/perturbation.hpp +++ b/include/ginkgo/core/base/perturbation.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/include/ginkgo/core/base/polymorphic_object.hpp b/include/ginkgo/core/base/polymorphic_object.hpp index 918764fc615..d65e9a1cb31 100644 --- a/include/ginkgo/core/base/polymorphic_object.hpp +++ b/include/ginkgo/core/base/polymorphic_object.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -290,24 +290,24 @@ class PolymorphicObject : public log::EnableLogging { * @see EnablePolymorphicObject for creating a concrete subclass of * PolymorphicObject. 
*/ -template +template class EnableAbstractPolymorphicObject : public PolymorphicBase { public: using PolymorphicBase::PolymorphicBase; - std::unique_ptr create_default( + std::unique_ptr create_default( std::shared_ptr exec) const { - return std::unique_ptr{static_cast( + return std::unique_ptr{static_cast( this->create_default_impl(std::move(exec)).release())}; } - std::unique_ptr create_default() const + std::unique_ptr create_default() const { return this->create_default(this->get_executor()); } - std::unique_ptr clone( + std::unique_ptr clone( std::shared_ptr exec) const { auto new_op = this->create_default(exec); @@ -315,25 +315,25 @@ class EnableAbstractPolymorphicObject : public PolymorphicBase { return new_op; } - std::unique_ptr clone() const + std::unique_ptr clone() const { return this->clone(this->get_executor()); } - AbstactObject *copy_from(const PolymorphicObject *other) + AbstractObject *copy_from(const PolymorphicObject *other) { - return static_cast(this->copy_from_impl(other)); + return static_cast(this->copy_from_impl(other)); } - AbstactObject *copy_from(std::unique_ptr other) + AbstractObject *copy_from(std::unique_ptr other) { - return static_cast( + return static_cast( this->copy_from_impl(std::move(other))); } - AbstactObject *clear() + AbstractObject *clear() { - return static_cast(this->clear_impl()); + return static_cast(this->clear_impl()); } }; diff --git a/include/ginkgo/core/base/range.hpp b/include/ginkgo/core/base/range.hpp index e238f276fab..83bff8dd7d5 100644 --- a/include/ginkgo/core/base/range.hpp +++ b/include/ginkgo/core/base/range.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -382,7 +382,7 @@ class range { return *this; } - GKO_ATTRIBUTES range(const range &other) = default; + range(const range &other) = default; /** * Returns the length of the specified dimension of the range. diff --git a/include/ginkgo/core/base/range_accessors.hpp b/include/ginkgo/core/base/range_accessors.hpp index ec252007ee2..4040301f287 100644 --- a/include/ginkgo/core/base/range_accessors.hpp +++ b/include/ginkgo/core/base/range_accessors.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,10 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_CORE_BASE_RANGE_ACCESSORS_HPP_ -#include +#include -#include +#include namespace gko { diff --git a/include/ginkgo/core/base/std_extensions.hpp b/include/ginkgo/core/base/std_extensions.hpp index 08a56b34cb6..34aa9cf05c3 100644 --- a/include/ginkgo/core/base/std_extensions.hpp +++ b/include/ginkgo/core/base/std_extensions.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,8 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_CORE_STD_EXTENSIONS_HPP_ -#define GKO_CORE_STD_EXTENSIONS_HPP_ +#ifndef GKO_CORE_BASE_STD_EXTENSIONS_HPP_ +#define GKO_CORE_BASE_STD_EXTENSIONS_HPP_ #include @@ -79,8 +79,78 @@ template using decay_t = typename std::decay::type; +/** + * constexpr helper which checks if lhs > rhs. 
This is helpful within template + * declarations since ">" cannot directly be used. Note that std::greater is + * available as constexpr only since C++14. This does not implement all the + * functionality of C++14, only what is needed so far. + * + * @tparam T type of both values which are checked + * + * @param lhs first operand + * @param rhs second operand + * + * @return whether lhs > rhs + */ +template +constexpr bool greater(const T &&lhs, const T &&rhs) +{ + return lhs > rhs; +} + +/** + * constexpr helper checking if lhs >= rhs + * + * @tparam T type of both values which are checked + * + * @param lhs first operand + * @param rhs second operand + * + * @return whether lhs >= rhs + */ +template +constexpr bool greater_equal(const T &&lhs, const T &&rhs) +{ + return lhs >= rhs; +} + + +/** + * constexpr helper checking if lhs < rhs + * + * @tparam T type of both values which are checked + * + * @param lhs first operand + * @param rhs second operand + * + * @return whether lhs < rhs + */ +template +constexpr bool less(const T &&lhs, const T &&rhs) +{ + return !greater_equal(lhs, rhs); +} + + +/** + * constexpr helper checking if lhs <= rhs + * + * @tparam T type of both values which are checked + * + * @param lhs first operand + * @param rhs second operand + * + * @return whether lhs <= rhs + */ +template +constexpr bool less_equal(const T &&lhs, const T &&rhs) +{ + return !greater(lhs, rhs); +} + + } // namespace xstd } // namespace gko -#endif // GKO_CORE_STD_EXTENSIONS_HPP_ +#endif // GKO_CORE_BASE_STD_EXTENSIONS_HPP_ diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 4c72c5f1a8c..6c2ab2a50d2 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -30,32 +30,36 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_CORE_TYPES_HPP_ -#define GKO_CORE_TYPES_HPP_ +#ifndef GKO_CORE_BASE_TYPES_HPP_ +#define GKO_CORE_BASE_TYPES_HPP_ #include #include +#include #include #include +#include +#include -#include -#include +#ifdef __HIPCC__ +#include +#endif // Macros for handling different compilers / architectures uniformly - -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) #define GKO_ATTRIBUTES __host__ __device__ #define GKO_INLINE __forceinline__ #else #define GKO_ATTRIBUTES #define GKO_INLINE inline -#endif // __CUDACC__ +#endif // defined(__CUDACC__) || defined(__HIPCC__) -#if defined(__CUDA_ARCH__) && defined(__APPLE__) +#if (defined(__CUDA_ARCH__) && defined(__APPLE__)) || \ + defined(__HIP_DEVICE_COMPILE__) #ifdef NDEBUG #define GKO_ASSERT(condition) ((void)0) @@ -69,12 +73,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
__FILE__, __LINE__, __func__))) #endif // NDEBUG -#else // defined(__CUDA_ARCH__) && defined(__APPLE__) +#else // (defined(__CUDA_ARCH__) && defined(__APPLE__)) || + // defined(__HIP_DEVICE_COMPILE__) // Handle assertions normally on other systems #define GKO_ASSERT(condition) assert(condition) -#endif // defined(__CUDA_ARCH__) && defined(__APPLE__) +#endif // (defined(__CUDA_ARCH__) && defined(__APPLE__)) || + // defined(__HIP_DEVICE_COMPILE__) // Handle deprecated notices correctly on different systems @@ -385,9 +391,23 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #define GKO_ENABLE_FOR_ALL_EXECUTORS(_enable_macro) \ _enable_macro(OmpExecutor, omp); \ + _enable_macro(HipExecutor, hip); \ _enable_macro(CudaExecutor, cuda) +/** + * Instantiates a template for each non-complex value type compiled by Ginkgo. + * + * @param _macro A macro which expands the template instantiation + * (not including the leading `template` specifier). + * Should take one argument, which is replaced by the + * value type. + */ +#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \ + template _macro(float); \ + template _macro(double) + + /** * Instantiates a template for each value type compiled by Ginkgo. * @@ -396,10 +416,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take one argument, which is replaced by the * value type. */ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ - template _macro(float); \ - template _macro(double); \ - template _macro(std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ + GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ + template _macro(std::complex); \ template _macro(std::complex) @@ -435,7 +454,23 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(std::complex, int64) +/** + * Instantiates a template for each value type conversion pair compiled by + * Ginkgo. 
+ * + * @param _macro A macro which expands the template instantiation + * (not including the leading `template` specifier). + * Should take two arguments `src` and `dst`, which + * are replaced by the source and destination value type. + */ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ + template _macro(float, double); \ + template _macro(double, float); \ + template _macro(std::complex, std::complex); \ + template _macro(std::complex, std::complex) + + } // namespace gko -#endif // GKO_CORE_TYPES_HPP_ +#endif // GKO_CORE_BASE_TYPES_HPP_ diff --git a/include/ginkgo/core/base/utils.hpp b/include/ginkgo/core/base/utils.hpp index 11434a30b9c..d3b81ffff23 100644 --- a/include/ginkgo/core/base/utils.hpp +++ b/include/ginkgo/core/base/utils.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,16 +34,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_CORE_BASE_UTILS_HPP_ -#include -#include -#include - - #include #include #include +#include +#include +#include +#include + + #ifndef NDEBUG #include #endif // NDEBUG @@ -290,7 +291,10 @@ inline typename std::decay::type *as(U *obj) if (auto p = dynamic_cast::type *>(obj)) { return p; } else { - throw NotSupported(__FILE__, __LINE__, __func__, typeid(obj).name()); + throw NotSupported(__FILE__, __LINE__, + std::string{"gko::as<"} + + name_demangling::get_type_name(typeid(T)) + ">", + name_demangling::get_type_name(typeid(*obj))); } } @@ -313,7 +317,88 @@ inline const typename std::decay::type *as(const U *obj) if (auto p = dynamic_cast::type *>(obj)) { return p; } else { - throw NotSupported(__FILE__, __LINE__, __func__, typeid(obj).name()); + throw NotSupported(__FILE__, __LINE__, + std::string{"gko::as<"} + + name_demangling::get_type_name(typeid(T)) + ">", + name_demangling::get_type_name(typeid(*obj))); + } +} + + +/** + * Performs polymorphic type conversion of a unique_ptr. + * + * @tparam T requested result type + * @tparam U static type of the passed object + * + * @param obj the unique_ptr to the object which should be converted. + * If successful, it will be reset to a nullptr. + * + * @return If successful, returns a unique_ptr to the subtype, otherwise throws + * NotSupported. + */ +template +inline std::unique_ptr::type> as( + std::unique_ptr &&obj) +{ + if (auto p = dynamic_cast::type *>(obj.get())) { + obj.release(); + return std::unique_ptr::type>{p}; + } else { + throw NotSupported(__FILE__, __LINE__, __func__, + name_demangling::get_type_name(typeid(*obj))); + } +} + + +/** + * Performs polymorphic type conversion of a shared_ptr. + * + * @tparam T requested result type + * @tparam U static type of the passed object + * + * @param obj the shared_ptr to the object which should be converted. + * + * @return If successful, returns a shared_ptr to the subtype, otherwise throws + * NotSupported. 
This pointer shares ownership with the input pointer. + */ +template +inline std::shared_ptr::type> as(std::shared_ptr obj) +{ + auto ptr = std::dynamic_pointer_cast::type>(obj); + if (ptr) { + return ptr; + } else { + throw NotSupported(__FILE__, __LINE__, __func__, + name_demangling::get_type_name(typeid(*obj))); + } +} + + +/** + * Performs polymorphic type conversion of a shared_ptr. + * + * This is the constant version of the function. + * + * @tparam T requested result type + * @tparam U static type of the passed object + * + * @param obj the shared_ptr to the object which should be converted. + * + * @return If successful, returns a shared_ptr to the subtype, otherwise throws + * NotSupported. This pointer shares ownership with the input pointer. + */ +template +inline std::shared_ptr::type> as( + std::shared_ptr obj) +{ + auto ptr = + std::dynamic_pointer_cast::type>(obj); + if (ptr) { + return ptr; + } else { + throw NotSupported(__FILE__, __LINE__, __func__, + name_demangling::get_type_name(typeid(*obj))); } } diff --git a/include/ginkgo/core/base/version.hpp b/include/ginkgo/core/base/version.hpp index 580f28eb394..52731ab56e3 100644 --- a/include/ginkgo/core/base/version.hpp +++ b/include/ginkgo/core/base/version.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,12 +34,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_CORE_BASE_VERSION_HPP_ -#include -#include +#include -#include -#include +#include +#include namespace gko { @@ -81,24 +80,41 @@ struct version { const char *const tag; }; +inline bool operator==(const version &first, const version &second) +{ + return first.major == second.major && first.minor == second.minor && + first.patch == second.patch; +} + +inline bool operator!=(const version &first, const version &second) +{ + return !(first == second); +} -#define GKO_ENABLE_VERSION_COMPARISON(_operator) \ - inline bool operator _operator(const version &first, \ - const version &second) \ - { \ - return std::tie(first.major, first.minor, first.patch) \ - _operator std::tie(second.major, second.minor, second.patch); \ - } \ - static_assert(true, \ - "This assert is used to counter the false positive extra " \ - "semi-colon warnings") - -GKO_ENABLE_VERSION_COMPARISON(<); -GKO_ENABLE_VERSION_COMPARISON(<=); -GKO_ENABLE_VERSION_COMPARISON(==); -GKO_ENABLE_VERSION_COMPARISON(!=); -GKO_ENABLE_VERSION_COMPARISON(>=); -GKO_ENABLE_VERSION_COMPARISON(>); +inline bool operator<(const version &first, const version &second) +{ + if (first.major < second.major) return true; + if (first.major == second.major && first.minor < second.minor) return true; + if (first.major == second.major && first.minor == second.minor && + first.patch < second.patch) + return true; + return false; +} + +inline bool operator<=(const version &first, const version &second) +{ + return !(second < first); +} + +inline bool operator>(const version &first, const version &second) +{ + return second < first; +} + +inline bool operator>=(const version &first, const version &second) +{ + return !(first < second); +} #undef GKO_ENABLE_VERSION_COMPARISON @@ -138,7 +154,7 @@ inline std::ostream &operator<<(std::ostream &os, const version &ver) * earlier version may have this implemented or fixed in a later version). 
* * This structure provides versions of different parts of Ginkgo: the headers, - * the core and the kernel modules (reference, OpenMP, CUDA). + * the core and the kernel modules (reference, OpenMP, CUDA, HIP). * To obtain an instance of version_info filled with information about the * current version of Ginkgo, call the version_info::get() static method. */ @@ -189,6 +205,13 @@ class version_info { */ version cuda_version; + /** + * Contains version information of the HIP module. + * + * This is the version of the static/shared library called "ginkgo_hip". + */ + version hip_version; + private: static constexpr version get_header_version() noexcept { @@ -204,12 +227,15 @@ class version_info { static version get_cuda_version() noexcept; + static version get_hip_version() noexcept; + version_info() : header_version{get_header_version()}, core_version{get_core_version()}, reference_version{get_reference_version()}, omp_version{get_omp_version()}, - cuda_version{get_cuda_version()} + cuda_version{get_cuda_version()}, + hip_version{get_hip_version()} {} }; diff --git a/include/ginkgo/core/factorization/ilu.hpp b/include/ginkgo/core/factorization/ilu.hpp new file mode 100644 index 00000000000..1d92bc59c54 --- /dev/null +++ b/include/ginkgo/core/factorization/ilu.hpp @@ -0,0 +1,153 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_FACTORIZATION_ILU_HPP_ +#define GKO_CORE_FACTORIZATION_ILU_HPP_ + + +#include + + +#include +#include +#include +#include + + +namespace gko { +/** + * @brief The Factorization namespace. + * + * @ingroup factor + */ +namespace factorization { + + +/** + * Represents an incomplete LU factorization -- ILU(0) -- of a sparse matrix. + * + * More specifically, it consists of a lower unitriangular factor $L$ and + * an upper triangular factor $U$ with sparsity pattern + * $\mathcal S(L + U)$ = $\mathcal S(A)$ + * fulfilling $LU = A$ at every non-zero location of $A$. 
+ * + * @tparam ValueType Type of the values of all matrices used in this class + * @tparam IndexType Type of the indices of all matrices used in this class + * + * @ingroup factor + * @ingroup LinOp + */ +template +class Ilu : public Composition { +public: + using value_type = ValueType; + using index_type = IndexType; + using matrix_type = matrix::Csr; + + std::shared_ptr get_l_factor() const + { + // Can be `static_cast` since the type is guaranteed in this class + return std::static_pointer_cast( + this->get_operators()[0]); + } + + std::shared_ptr get_u_factor() const + { + // Can be `static_cast` since the type is guaranteed in this class + return std::static_pointer_cast( + this->get_operators()[1]); + } + + // Remove the possibility of calling `create`, which was enabled by + // `Composition` + template + static std::unique_ptr> create(Args &&... args) = + delete; + + GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) + { + /** + * Strategy which will be used by the L matrix. The default value + * `nullptr` will result in the strategy `classical`. + */ + std::shared_ptr + GKO_FACTORY_PARAMETER(l_strategy, nullptr); + + /** + * Strategy which will be used by the U matrix. The default value + * `nullptr` will result in the strategy `classical`. 
+ */ + std::shared_ptr + GKO_FACTORY_PARAMETER(u_strategy, nullptr); + }; + GKO_ENABLE_LIN_OP_FACTORY(Ilu, parameters, Factory); + GKO_ENABLE_BUILD_METHOD(Factory); + +protected: + Ilu(const Factory *factory, std::shared_ptr system_matrix) + : Composition{factory->get_executor()}, + parameters_{factory->get_parameters()} + { + if (parameters_.l_strategy == nullptr) { + parameters_.l_strategy = + std::make_shared(); + } + if (parameters_.u_strategy == nullptr) { + parameters_.u_strategy = + std::make_shared(); + } + generate_l_u(system_matrix)->move_to(this); + } + + /** + * Generates the incomplete LU factors, which will be returned as a + * composition of the lower (first element of the composition) and the + * upper factor (second element). The dynamic type of both L and U is + * matrix_type. + * + * @param system_matrix the source matrix used to generate the factors. + * @note: system_matrix must be convertible to a Csr + * Matrix, otherwise, an exception is thrown. + * @return A Composition, containing the incomplete LU factors for the + * given system_matrix (first element is L, then U) + */ + std::unique_ptr> generate_l_u( + const std::shared_ptr &system_matrix) const; +}; + + +} // namespace factorization +} // namespace gko + + +#endif // GKO_CORE_FACTORIZATION_ILU_HPP_ diff --git a/include/ginkgo/core/factorization/par_ict.hpp b/include/ginkgo/core/factorization/par_ict.hpp new file mode 100644 index 00000000000..d1f22f2f029 --- /dev/null +++ b/include/ginkgo/core/factorization/par_ict.hpp @@ -0,0 +1,253 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_FACTORIZATION_PAR_ICT_HPP_ +#define GKO_CORE_FACTORIZATION_PAR_ICT_HPP_ + + +#include + + +#include +#include +#include +#include + + +namespace gko { +/** + * @brief The Factorization namespace. + * + * @ingroup factor + */ +namespace factorization { + + +/** + * ParICT is an incomplete threshold-based Cholesky factorization which is + * computed in parallel. + * + * $L$ is a lower triangular matrix which approximates a given symmetric + * positive definite matrix $A$ with $A \approx LL^T$. Here, $L$ has a sparsity + * pattern that is improved iteratively based on its element-wise magnitude. + * The initial sparsity pattern is chosen based on the lower triangle of $A$. 
+ * + * One iteration of the ParICT algorithm consists of the following steps: + * + * 1. Calculating the residual $R = A - LL^T$ + * 2. Adding new non-zero locations from $R$ to $L$. + * The new non-zero locations are initialized based on the corresponding + * residual value. + * 3. Executing a fixed-point iteration on $L$ according to + * $ + * F(L) = + * \begin{cases} + * \frac{1}{l_{jj}} + * \left(a_{ij}-\sum_{k=1}^{j-1}l_{ik}l_{jk}\right), \quad & i \neq j \\ + * \sqrt{a_{ij}-\sum_{k=1}^{j-1}l_{ik}l_{jk}}, \quad & i = j \\ + * \end{cases} + * $ + * 4. Removing the smallest entries (by magnitude) from $L$ + * 5. Executing a fixed-point iteration on the (now sparser) $L$ + * + * This ParICT algorithm thus improves the sparsity pattern and the + * approximation of $L$ simultaneously. + * + * The implementation follows the design of H. Anzt et al., + * ParILUT - A Parallel Threshold ILU for GPUs, 2019 IEEE International + * Parallel and Distributed Processing Symposium (IPDPS), pp. 231–241. + * + * @tparam ValueType Type of the values of all matrices used in this class + * @tparam IndexType Type of the indices of all matrices used in this class + * + * @ingroup factor + * @ingroup LinOp + */ +template +class ParIct : public Composition { +public: + using value_type = ValueType; + using index_type = IndexType; + using matrix_type = matrix::Csr; + + std::shared_ptr get_l_factor() const + { + // Can be `static_cast` since the type is guaranteed in this class + return std::static_pointer_cast( + this->get_operators()[0]); + } + + std::shared_ptr get_lt_factor() const + { + // Can be `static_cast` since the type is guaranteed in this class + return std::static_pointer_cast( + this->get_operators()[1]); + } + + // Remove the possibility of calling `create`, which was enabled by + // `Composition` + template + static std::unique_ptr> create(Args &&... 
args) = + delete; + + GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) + { + /** + * The number of total iterations of ParICT that will be executed. + * The default value is 5. + */ + size_type GKO_FACTORY_PARAMETER(iterations, 5); + + /** + * @brief `true` means it is known that the matrix given to this + * factory will be sorted first by row, then by column index, + * `false` means it is unknown or not sorted, so an additional + * sorting step will be performed during the factorization + * (it will not change the matrix given). + * The matrix must be sorted for this factorization to work. + * + * The `system_matrix`, which will be given to this factory, must be + * sorted (first by row, then by column) in order for the algorithm + * to work. If it is known that the matrix will be sorted, this + * parameter can be set to `true` to skip the sorting (therefore, + * shortening the runtime). + * However, if it is unknown or if the matrix is known to be not sorted, + * it must remain `false`, otherwise, the factorization might be + * incorrect. + */ + bool GKO_FACTORY_PARAMETER(skip_sorting, false); + + /** + * @brief `true` means the candidate selection will use an inexact + * selection algorithm. `false` means an exact selection algorithm will + * be used. + * + * Using the approximate selection algorithm can give a significant + * speed-up, but may in the worst case cause the algorithm to vastly + * exceed its `fill_in_limit`. + * The exact selection needs more time, but more closely fulfills the + * `fill_in_limit` except for pathological cases (many candidates with + * equal magnitude). + * + * The default behavior is to use approximate selection. + */ + bool GKO_FACTORY_PARAMETER(approximate_select, true); + + /** + * @brief `true` means the sample used for the selection algorithm will + * be chosen deterministically. This is only relevant when using + * `approximate_select`. It is mostly used for testing. 
+ * + * The selection algorithm used for `approximate_select` uses a small + * sample of the input data to determine an approximate threshold. + * The choice of elements can either be randomized, i.e., we may use + * different elements during each execution, or deterministic, i.e., the + * element choices are always the same. + * + * Note that even though the threshold selection step may be made + * deterministic this way, the calculation of the IC factors can still + * be non-deterministic due to its asynchronous iterations. + * + * The default behavior is to use a random sample. + */ + bool GKO_FACTORY_PARAMETER(deterministic_sample, false); + + /** + * @brief the amount of fill-in that is allowed in L compared to + * the lower triangle of A. + * + * The threshold for removing candidates from the intermediate L + * is set such that the resulting sparsity pattern has at most + * `fill_in_limit` times the number of non-zeros of the lower triangle + * of A. + * + * The default value `2.0` allows twice the number of non-zeros in + * L compared to the lower triangle of A. + */ + double GKO_FACTORY_PARAMETER(fill_in_limit, 2.0); + + /** + * Strategy which will be used by the L matrix. The default value + * `nullptr` will result in the strategy `classical`. + */ + std::shared_ptr + GKO_FACTORY_PARAMETER(l_strategy, nullptr); + + /** + * Strategy which will be used by the L^T matrix. The default value + * `nullptr` will result in the strategy `classical`.
+ */ + std::shared_ptr + GKO_FACTORY_PARAMETER(lt_strategy, nullptr); + }; + GKO_ENABLE_LIN_OP_FACTORY(ParIct, parameters, Factory); + GKO_ENABLE_BUILD_METHOD(Factory); + +protected: + explicit ParIct(const Factory *factory, + std::shared_ptr system_matrix) + : Composition(factory->get_executor()), + parameters_{factory->get_parameters()} + { + if (parameters_.l_strategy == nullptr) { + parameters_.l_strategy = + std::make_shared(); + } + if (parameters_.lt_strategy == nullptr) { + parameters_.lt_strategy = + std::make_shared(); + } + generate_l_lt(std::move(system_matrix))->move_to(this); + } + + /** + * Generates the incomplete LL^T factors, which will be returned as a + * composition of the lower (first element of the composition) and the + * upper factor (second element). The dynamic type of L and L^T is + * matrix_type + * + * @param system_matrix the source matrix used to generate the factors. + * @note: system_matrix must be convertible to a Csr + * Matrix, otherwise, an exception is thrown. + * @return A Composition, containing the incomplete LL^T factors for the + * given system_matrix (first element is L, then L^T) + */ + std::unique_ptr> generate_l_lt( + const std::shared_ptr &system_matrix) const; +}; + + +} // namespace factorization +} // namespace gko + + +#endif // GKO_CORE_FACTORIZATION_PAR_ICT_HPP_ diff --git a/include/ginkgo/core/factorization/par_ilu.hpp b/include/ginkgo/core/factorization/par_ilu.hpp index 8e42b4dc5a9..f3f4458a581 100644 --- a/include/ginkgo/core/factorization/par_ilu.hpp +++ b/include/ginkgo/core/factorization/par_ilu.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -147,6 +147,20 @@ class ParIlu : public Composition { * incorrect.
*/ bool GKO_FACTORY_PARAMETER(skip_sorting, false); + + /** + * Strategy which will be used by the L matrix. The default value + * `nullptr` will result in the strategy `classical`. + */ + std::shared_ptr + GKO_FACTORY_PARAMETER(l_strategy, nullptr); + + /** + * Strategy which will be used by the U matrix. The default value + * `nullptr` will result in the strategy `classical`. + */ + std::shared_ptr + GKO_FACTORY_PARAMETER(u_strategy, nullptr); }; GKO_ENABLE_LIN_OP_FACTORY(ParIlu, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); @@ -157,7 +171,17 @@ class ParIlu : public Composition { : Composition(factory->get_executor()), parameters_{factory->get_parameters()} { - generate_l_u(system_matrix, parameters_.skip_sorting)->move_to(this); + if (parameters_.l_strategy == nullptr) { + parameters_.l_strategy = + std::make_shared(); + } + if (parameters_.u_strategy == nullptr) { + parameters_.u_strategy = + std::make_shared(); + } + generate_l_u(system_matrix, parameters_.skip_sorting, + parameters_.l_strategy, parameters_.u_strategy) + ->move_to(this); } /** @@ -167,17 +191,21 @@ class ParIlu : public Composition { * while the dynamic type of U is u_matrix_type. * * @param system_matrix the source matrix used to generate the factors. - * @note: system_matrix must be convertable to a Csr + * @note: system_matrix must be convertible to a Csr * Matrix, otherwise, an exception is thrown. * @param skip_sorting if set to `true`, the sorting will be skipped. * @note: If the matrix is not sorted, the * factorization fails. + * @param l_strategy Strategy, which will be used by the L matrix. + * @param u_strategy Strategy, which will be used by the U matrix. 
* @return A Composition, containing the incomplete LU factors for the * given system_matrix (first element is L, then U) */ std::unique_ptr> generate_l_u( - const std::shared_ptr &system_matrix, - bool skip_sorting) const; + const std::shared_ptr &system_matrix, bool skip_sorting, + std::shared_ptr l_strategy, + std::shared_ptr u_strategy) + const; }; diff --git a/include/ginkgo/core/factorization/par_ilut.hpp b/include/ginkgo/core/factorization/par_ilut.hpp new file mode 100644 index 00000000000..364a7bd9351 --- /dev/null +++ b/include/ginkgo/core/factorization/par_ilut.hpp @@ -0,0 +1,258 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_FACTORIZATION_PAR_ILUT_HPP_ +#define GKO_CORE_FACTORIZATION_PAR_ILUT_HPP_ + + +#include + + +#include +#include +#include +#include + + +namespace gko { +/** + * @brief The Factorization namespace. + * + * @ingroup factor + */ +namespace factorization { + + +/** + * ParILUT is an incomplete threshold-based LU factorization which is computed + * in parallel. + * + * $L$ is a lower unitriangular, while $U$ is an upper triangular matrix, which + * approximate a given matrix $A$ with $A \approx LU$. Here, $L$ and $U$ have + * a sparsity pattern that is improved iteratively based on their element-wise + * magnitude. The initial sparsity pattern is chosen based on the $ILU(0)$ + * factorization of $A$. + * + * One iteration of the ParILUT algorithm consists of the following steps: + * + * 1. Calculating the residual $R = A - LU$ + * 2. Adding new non-zero locations from $R$ to $L$ and $U$. + * The new non-zero locations are initialized based on the corresponding + * residual value. + * 3. Executing a fixed-point iteration on $L$ and $U$ according to + * $ + * F(L, U) = + * \begin{cases} + * \frac{1}{u_{jj}} + * \left(a_{ij}-\sum_{k=1}^{j-1}l_{ik}u_{kj}\right), \quad & i>j \\ + * a_{ij}-\sum_{k=1}^{i-1}l_{ik}u_{kj}, \quad & i\leq j + * \end{cases} + * $ + * For a more detailed description of the fixed-point iteration, see + * @ref ParIlu. + * 4. 
Removing the smallest entries (by magnitude) from $L$ and $U$ + * 5. Executing a fixed-point iteration on the (now sparser) $L$ and $U$ + * + * This ParILUT algorithm thus improves the sparsity pattern and the + * approximation of $L$ and $U$ simultaneously. + * + * The implementation follows the design of H. Anzt et al., + * ParILUT - A Parallel Threshold ILU for GPUs, 2019 IEEE International + * Parallel and Distributed Processing Symposium (IPDPS), pp. 231–241. + * + * @tparam ValueType Type of the values of all matrices used in this class + * @tparam IndexType Type of the indices of all matrices used in this class + * + * @ingroup factor + * @ingroup LinOp + */ +template +class ParIlut : public Composition { +public: + using value_type = ValueType; + using index_type = IndexType; + using l_matrix_type = matrix::Csr; + using u_matrix_type = matrix::Csr; + + std::shared_ptr get_l_factor() const + { + // Can be `static_cast` since the type is guaranteed in this class + return std::static_pointer_cast( + this->get_operators()[0]); + } + + std::shared_ptr get_u_factor() const + { + // Can be `static_cast` since the type is guaranteed in this class + return std::static_pointer_cast( + this->get_operators()[1]); + } + + // Remove the possibility of calling `create`, which was enabled by + // `Composition` + template + static std::unique_ptr> create(Args &&... args) = + delete; + + GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) + { + /** + * The number of total iterations of ParILUT that will be executed. + * The default value is 5. + */ + size_type GKO_FACTORY_PARAMETER(iterations, 5); + + /** + * @brief `true` means it is known that the matrix given to this + * factory will be sorted first by row, then by column index, + * `false` means it is unknown or not sorted, so an additional + * sorting step will be performed during the factorization + * (it will not change the matrix given). + * The matrix must be sorted for this factorization to work. 
+ * + * The `system_matrix`, which will be given to this factory, must be + * sorted (first by row, then by column) in order for the algorithm + * to work. If it is known that the matrix will be sorted, this + * parameter can be set to `true` to skip the sorting (therefore, + * shortening the runtime). + * However, if it is unknown or if the matrix is known to be not sorted, + * it must remain `false`, otherwise, the factorization might be + * incorrect. + */ + bool GKO_FACTORY_PARAMETER(skip_sorting, false); + + /** + * @brief `true` means the candidate selection will use an inexact + * selection algorithm. `false` means an exact selection algorithm will + * be used. + * + * Using the approximate selection algorithm can give a significant + * speed-up, but may in the worst case cause the algorithm to vastly + * exceed its `fill_in_limit`. + * The exact selection needs more time, but more closely fulfills the + * `fill_in_limit` except for pathological cases (many candidates with + * equal magnitude). + * + * The default behavior is to use approximate selection. + */ + bool GKO_FACTORY_PARAMETER(approximate_select, true); + + /** + * @brief `true` means the sample used for the selection algorithm will + * be chosen deterministically. This is only relevant when using + * `approximate_select`. It is mostly used for testing. + * + * The selection algorithm used for `approximate_select` uses a small + * sample of the input data to determine an approximate threshold. + * The choice of elements can either be randomized, i.e., we may use + * different elements during each execution, or deterministic, i.e., the + * element choices are always the same. + * + * Note that even though the threshold selection step may be made + * deterministic this way, the calculation of the ILU factors can still + * be non-deterministic due to its asynchronous iterations. + * + * The default behavior is to use a random sample. 
+ */ + bool GKO_FACTORY_PARAMETER(deterministic_sample, false); + + /** + * @brief the amount of fill-in that is allowed in L and U compared to + * the ILU(0) factorization. + * + * The threshold for removing candidates from the intermediate L and U + * is set such that the resulting sparsity pattern has at most + * `fill_in_limit` times the number of non-zeros of the ILU(0) + * factorization. This selection is executed separately for both + * factors L and U. + * + * The default value `2.0` allows twice the number of non-zeros in + * L and U compared to ILU(0). + */ + double GKO_FACTORY_PARAMETER(fill_in_limit, 2.0); + + /** + * Strategy which will be used by the L matrix. The default value + * `nullptr` will result in the strategy `classical`. + */ + std::shared_ptr + GKO_FACTORY_PARAMETER(l_strategy, nullptr); + + /** + * Strategy which will be used by the U matrix. The default value + * `nullptr` will result in the strategy `classical`. + */ + std::shared_ptr + GKO_FACTORY_PARAMETER(u_strategy, nullptr); + }; + GKO_ENABLE_LIN_OP_FACTORY(ParIlut, parameters, Factory); + GKO_ENABLE_BUILD_METHOD(Factory); + +protected: + explicit ParIlut(const Factory *factory, + std::shared_ptr system_matrix) + : Composition(factory->get_executor()), + parameters_{factory->get_parameters()} + { + if (parameters_.l_strategy == nullptr) { + parameters_.l_strategy = + std::make_shared(); + } + if (parameters_.u_strategy == nullptr) { + parameters_.u_strategy = + std::make_shared(); + } + generate_l_u(std::move(system_matrix))->move_to(this); + } + + /** + * Generates the incomplete LU factors, which will be returned as a + * composition of the lower (first element of the composition) and the + * upper factor (second element). The dynamic type of L is l_matrix_type, + * while the dynamic type of U is u_matrix_type. + * + * @param system_matrix the source matrix used to generate the factors. 
+ * @note: system_matrix must be convertible to a Csr + * Matrix, otherwise, an exception is thrown. + * @return A Composition, containing the incomplete LU factors for the + * given system_matrix (first element is L, then U) + */ + std::unique_ptr> generate_l_u( + const std::shared_ptr &system_matrix) const; +}; + + +} // namespace factorization +} // namespace gko + + +#endif // GKO_CORE_FACTORIZATION_PAR_ILUT_HPP_ diff --git a/include/ginkgo/core/log/convergence.hpp b/include/ginkgo/core/log/convergence.hpp index 5079f587137..64cd16ea1aa 100644 --- a/include/ginkgo/core/log/convergence.hpp +++ b/include/ginkgo/core/log/convergence.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,12 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_CORE_LOG_CONVERGENCE_HPP_ -#include - - #include +#include #include #include diff --git a/include/ginkgo/core/log/logger.hpp b/include/ginkgo/core/log/logger.hpp index 9b0abe9ed47..90eb1a3063c 100644 --- a/include/ginkgo/core/log/logger.hpp +++ b/include/ginkgo/core/log/logger.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,8 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************/ -#ifndef GKO_CORE_LOGGER_HPP_ -#define GKO_CORE_LOGGER_HPP_ +#ifndef GKO_CORE_LOG_LOGGER_HPP_ +#define GKO_CORE_LOG_LOGGER_HPP_ #include @@ -556,4 +556,4 @@ class EnableLogging : public Loggable { } // namespace gko -#endif // GKO_CORE_LOGGER_HPP_ +#endif // GKO_CORE_LOG_LOGGER_HPP_ diff --git a/include/ginkgo/core/log/papi.hpp b/include/ginkgo/core/log/papi.hpp index 7d8279c023d..7b54e478f6e 100644 --- a/include/ginkgo/core/log/papi.hpp +++ b/include/ginkgo/core/log/papi.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -46,18 +46,24 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include + + #include #include -#include "third_party/papi_sde/papi_sde_interface.h" -#include +#include "third_party/papi_sde/papi_sde_interface.h" namespace gko { namespace log { +static size_type papi_logger_count = 0; +static std::mutex papi_count_mutex; + + /** * Papi is a Logger which logs every event to the PAPI software. 
Thanks to this * logger, applications which interface with PAPI can access Ginkgo internal @@ -179,7 +185,7 @@ class Papi : public Logger { */ static std::shared_ptr create( std::shared_ptr exec, - const Logger::mask_type &enabled_events) + const Logger::mask_type &enabled_events = Logger::all_events_mask) { return std::shared_ptr(new Papi(exec, enabled_events)); } @@ -200,11 +206,11 @@ class Papi : public Logger { { std::ostringstream os; - std::lock_guard guard(count_mutex); - os << "ginkgo" << logger_count; + std::lock_guard guard(papi_count_mutex); + os << "ginkgo" << papi_logger_count; name = os.str(); papi_handle = papi_sde_init(name.c_str()); - logger_count++; + papi_logger_count++; } private: @@ -301,8 +307,6 @@ class Papi : public Logger { mutable papi_queue iteration_complete{&papi_handle, "iteration_complete"}; - static size_type logger_count; - std::mutex count_mutex; std::string name{"ginkgo"}; papi_handle_t papi_handle; @@ -314,4 +318,4 @@ class Papi : public Logger { #endif // GKO_HAVE_PAPI_SDE -#endif // GKO_CORE_LOG_OSTREAM_HPP_ +#endif // GKO_CORE_LOG_PAPI_HPP_ diff --git a/include/ginkgo/core/log/record.hpp b/include/ginkgo/core/log/record.hpp index f675dda4495..0c791e5e278 100644 --- a/include/ginkgo/core/log/record.hpp +++ b/include/ginkgo/core/log/record.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,13 +34,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_CORE_LOG_RECORD_HPP_ -#include - - #include #include +#include #include #include diff --git a/include/ginkgo/core/log/stream.hpp b/include/ginkgo/core/log/stream.hpp index 02fd4ae46f2..d46a0d07be0 100644 --- a/include/ginkgo/core/log/stream.hpp +++ b/include/ginkgo/core/log/stream.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,13 +34,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_CORE_LOG_STREAM_HPP_ -#include - - #include #include +#include + + namespace gko { namespace log { diff --git a/include/ginkgo/core/matrix/coo.hpp b/include/ginkgo/core/matrix/coo.hpp index 36468411a8a..e50c8c033d5 100644 --- a/include/ginkgo/core/matrix/coo.hpp +++ b/include/ginkgo/core/matrix/coo.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -55,6 +55,10 @@ template class Dense; +template +class CooBuilder; + + /** * COO stores a matrix in the coordinate matrix format. 
* @@ -72,6 +76,7 @@ class Dense; template class Coo : public EnableLinOp>, public EnableCreateMethod>, + public ConvertibleTo, IndexType>>, public ConvertibleTo>, public ConvertibleTo>, public ReadableFromMatrixData, @@ -80,6 +85,7 @@ class Coo : public EnableLinOp>, friend class EnablePolymorphicObject; friend class Csr; friend class Dense; + friend class CooBuilder; public: using EnableLinOp::convert_to; @@ -89,6 +95,13 @@ class Coo : public EnableLinOp>, using index_type = IndexType; using mat_data = matrix_data; + friend class Coo, IndexType>; + + void convert_to( + Coo, IndexType> *result) const override; + + void move_to(Coo, IndexType> *result) override; + void convert_to(Csr *other) const override; void move_to(Csr *other) override; diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp index 4dd2e5d862c..227b6b06a28 100644 --- a/include/ginkgo/core/matrix/csr.hpp +++ b/include/ginkgo/core/matrix/csr.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -60,6 +60,22 @@ class Sellp; template class SparsityCsr; +template +class Csr; + +template +class CsrBuilder; + + +namespace detail { + + +template +void strategy_rebuild_helper(Csr *result); + + +} // namespace detail + /** * CSR is a matrix format which stores only the nonzero coefficients by @@ -70,6 +86,28 @@ class SparsityCsr; * An additional column index array is used to identify the column of each * nonzero element. 
* + * The Csr LinOp supports different operations: + * + * ```cpp + * matrix::Csr *A, *B, *C; // matrices + * matrix::Dense *b, *x; // vectors tall-and-skinny matrices + * matrix::Dense *alpha, *beta; // scalars of dimension 1x1 + * matrix::Identity *I; // identity matrix + * + * // Applying to Dense matrices computes an SpMV/SpMM product + * A->apply(b, x) // x = A*b + * A->apply(alpha, b, beta, x) // x = alpha*A*b + beta*x + * + * // Applying to Csr matrices computes a SpGEMM product of two sparse matrices + * A->apply(B, C) // C = A*B + * A->apply(alpha, B, beta, C) // C = alpha*A*B + beta*C + * + * // Applying to an Identity matrix computes a SpGEAM sparse matrix addition + * A->apply(alpha, I, beta, B) // B = alpha*A + beta*B + * ``` + * Both the SpGEMM and SpGEAM operation require the input matrices to be sorted + * by column index, otherwise the algorithms will produce incorrect results. + * * @tparam ValueType precision of matrix elements * @tparam IndexType precision of matrix indexes * @@ -80,6 +118,7 @@ class SparsityCsr; template class Csr : public EnableLinOp>, public EnableCreateMethod>, + public ConvertibleTo, IndexType>>, public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, @@ -88,7 +127,8 @@ class Csr : public EnableLinOp>, public ConvertibleTo>, public ReadableFromMatrixData, public WritableToMatrixData, - public Transposable { + public Transposable, + public Permutable { friend class EnableCreateMethod; friend class EnablePolymorphicObject; friend class Coo; @@ -97,30 +137,64 @@ class Csr : public EnableLinOp>, friend class Hybrid; friend class Sellp; friend class SparsityCsr; + friend class CsrBuilder; public: - using EnableLinOp::convert_to; - using EnableLinOp::move_to; - using value_type = ValueType; using index_type = IndexType; + using transposed_type = Csr; using mat_data = matrix_data; class automatical; + /** + * strategy_type is to decide how to set the csr algorithm. 
+ * + * The practical strategy method should inherit strategy_type and implement + * its `process`, `clac_size` function and the corresponding device kernel. + */ class strategy_type { friend class automatical; public: + /** + * Creates a strategy_type. + * + * @param name the name of strategy + */ strategy_type(std::string name) : name_(name) {} + /** + * Returns the name of strategy + * + * @return the name of strategy + */ std::string get_name() { return name_; } + /** + * Computes srow according to row pointers. + * + * @param mtx_row_ptrs the row pointers of the matrix + * @param mtx_srow the srow of the matrix + */ virtual void process(const Array &mtx_row_ptrs, Array *mtx_srow) = 0; + /** + * Computes the srow size according to the number of nonzeros. + * + * @param nnz the number of nonzeros + * + * @return the size of srow + */ virtual int64_t clac_size(const int64_t nnz) = 0; + /** + * Copy a strategy. This is a workaround until strategies are revamped, + * since strategies like `automatical` do not work when actually shared. + */ + virtual std::shared_ptr copy() = 0; + protected: void set_name(std::string name) { name_ = name; } @@ -128,58 +202,183 @@ class Csr : public EnableLinOp>, std::string name_; }; + /** + * classical is a strategy_type which uses the same number of threads on + * each row. Classical strategy uses multithreads to calculate on parts of + * rows and then do a reduction of these threads results. The number of + * threads per row depends on the max number of stored elements per row. + */ class classical : public strategy_type { public: - classical() : strategy_type("classical") {} + /** + * Creates a classical strategy. 
+ */ + classical() : strategy_type("classical"), max_length_per_row_(0) {} void process(const Array &mtx_row_ptrs, - Array *mtx_srow) - {} + Array *mtx_srow) override + { + auto host_mtx_exec = mtx_row_ptrs.get_executor()->get_master(); + Array row_ptrs_host(host_mtx_exec); + const bool is_mtx_on_host{host_mtx_exec == + mtx_row_ptrs.get_executor()}; + const index_type *row_ptrs{}; + if (is_mtx_on_host) { + row_ptrs = mtx_row_ptrs.get_const_data(); + } else { + row_ptrs_host = mtx_row_ptrs; + row_ptrs = row_ptrs_host.get_const_data(); + } + auto num_rows = mtx_row_ptrs.get_num_elems() - 1; + max_length_per_row_ = 0; + for (index_type i = 1; i < num_rows + 1; i++) { + max_length_per_row_ = std::max(max_length_per_row_, + row_ptrs[i] - row_ptrs[i - 1]); + } + } + + int64_t clac_size(const int64_t nnz) override { return 0; } + + index_type get_max_length_per_row() const noexcept + { + return max_length_per_row_; + } + + std::shared_ptr copy() override + { + return std::make_shared(); + } - int64_t clac_size(const int64_t nnz) { return 0; } + private: + index_type max_length_per_row_; }; + /** + * merge_path is a strategy_type which uses the merge_path algorithm. + * merge_path is according to Merrill and Garland: Merge-Based Parallel + * Sparse Matrix-Vector Multiplication + */ class merge_path : public strategy_type { public: + /** + * Creates a merge_path strategy. + */ merge_path() : strategy_type("merge_path") {} void process(const Array &mtx_row_ptrs, - Array *mtx_srow) + Array *mtx_srow) override {} - int64_t clac_size(const int64_t nnz) { return 0; } + int64_t clac_size(const int64_t nnz) override { return 0; } + + std::shared_ptr copy() override + { + return std::make_shared(); + } }; + /** + * cusparse is a strategy_type which uses the sparselib csr. + * + * @note cusparse is also known to the hip executor which converts between + * cuda and hip. + */ class cusparse : public strategy_type { public: + /** + * Creates a cusparse strategy. 
+ */ cusparse() : strategy_type("cusparse") {} void process(const Array &mtx_row_ptrs, - Array *mtx_srow) + Array *mtx_srow) override {} - int64_t clac_size(const int64_t nnz) { return 0; } + int64_t clac_size(const int64_t nnz) override { return 0; } + + std::shared_ptr copy() override + { + return std::make_shared(); + } }; + /** + * sparselib is a strategy_type which uses the sparselib csr. + * + * @note Uses cusparse in cuda and hipsparse in hip. + */ + class sparselib : public strategy_type { + public: + /** + * Creates a sparselib strategy. + */ + sparselib() : strategy_type("sparselib") {} + + void process(const Array &mtx_row_ptrs, + Array *mtx_srow) override + {} + + int64_t clac_size(const int64_t nnz) override { return 0; } + + std::shared_ptr copy() override + { + return std::make_shared(); + } + }; + + /** + * load_balance is a strategy_type which uses the load balance algorithm. + */ class load_balance : public strategy_type { public: + /** + * Creates a load_balance strategy. + */ load_balance() : load_balance(std::move( gko::CudaExecutor::create(0, gko::OmpExecutor::create()))) {} + /** + * Creates a load_balance strategy with CUDA executor. + * + * @param exec the CUDA executor + */ load_balance(std::shared_ptr exec) - : load_balance(exec->get_num_warps()) + : load_balance(exec->get_num_warps(), exec->get_warp_size()) + {} + + /** + * Creates a load_balance strategy with HIP executor. + * + * @param exec the HIP executor + */ + load_balance(std::shared_ptr exec) + : load_balance(exec->get_num_warps(), exec->get_warp_size(), false) {} - load_balance(int64_t nwarps) - : strategy_type("load_balance"), nwarps_(nwarps) + /** + * Creates a load_balance strategy with specified parameters + * + * @param nwarps the number of warps in the executor + * @param warp_size the warp size of the executor + * @param cuda_strategy whether the `cuda_strategy` needs to be used. + * + * @note The warp_size must be the size of full warp. 
When using this + * constructor, set_strategy needs to be called with correct + * parameters which is replaced during the conversion. + */ + load_balance(int64_t nwarps, int warp_size = 32, + bool cuda_strategy = true) + : strategy_type("load_balance"), + nwarps_(nwarps), + warp_size_(warp_size), + cuda_strategy_(cuda_strategy) {} void process(const Array &mtx_row_ptrs, - Array *mtx_srow) + Array *mtx_srow) override { - constexpr uint32 warp_size = 32; auto nwarps = mtx_srow->get_num_elems(); if (nwarps > 0) { @@ -212,8 +411,8 @@ class Csr : public EnableLinOp>, const auto num_elems = row_ptrs[num_rows]; for (size_type i = 0; i < num_rows; i++) { auto bucket = - ceildiv((ceildiv(row_ptrs[i + 1], warp_size) * nwarps), - ceildiv(num_elems, warp_size)); + ceildiv((ceildiv(row_ptrs[i + 1], warp_size_) * nwarps), + ceildiv(num_elems, warp_size_)); if (bucket < nwarps) { srow[bucket]++; } @@ -228,44 +427,121 @@ class Csr : public EnableLinOp>, } } - int64_t clac_size(const int64_t nnz) + int64_t clac_size(const int64_t nnz) override { - constexpr uint32 warp_size = 32; - int multiple = 8; - if (nnz >= 2000000) { - multiple = 128; - } else if (nnz >= 200000) { - multiple = 32; + if (warp_size_ > 0) { + int multiple = 8; + if (nnz >= 2e6) { + multiple = 128; + } else if (nnz >= 2e5) { + multiple = 32; + } + +#if GINKGO_HIP_PLATFORM_HCC + if (!cuda_strategy_) { + multiple = 8; + if (nnz >= 1e7) { + multiple = 64; + } else if (nnz >= 1e6) { + multiple = 16; + } + } +#endif // GINKGO_HIP_PLATFORM_HCC + + auto nwarps = nwarps_ * multiple; + return min(ceildiv(nnz, warp_size_), int64_t(nwarps)); + } else { + return 0; } - auto nwarps = nwarps_ * multiple; - return min(ceildiv(nnz, warp_size), static_cast(nwarps)); + } + + std::shared_ptr copy() override + { + return std::make_shared(nwarps_, warp_size_, + cuda_strategy_); } private: int64_t nwarps_; + int warp_size_; + bool cuda_strategy_; }; class automatical : public strategy_type { public: + /* Use imbalance strategy when 
the maximum number of nonzeros per row is + * more than 1024 on NVIDIA hardware */ + const index_type nvidia_row_len_limit = 1024; + /* Use imbalance strategy when the matrix has more than 1e6 nonzeros on + * NVIDIA hardware */ + const index_type nvidia_nnz_limit = 1e6; + /* Use imbalance strategy when the maximum number of nonzeros per row is + * more than 768 on AMD hardware */ + const index_type amd_row_len_limit = 768; + /* Use imbalance strategy when the matrix has more than 1e8 nonzeros on AMD + * hardware */ + const index_type amd_nnz_limit = 1e8; + + /** + * Creates an automatical strategy. + */ automatical() : automatical(std::move( gko::CudaExecutor::create(0, gko::OmpExecutor::create()))) {} + /** + * Creates an automatical strategy with CUDA executor. + * + * @param exec the CUDA executor + */ automatical(std::shared_ptr exec) - : automatical(exec->get_num_warps()) + : automatical(exec->get_num_warps(), exec->get_warp_size()) + {} + + /** + * Creates an automatical strategy with HIP executor. + * + * @param exec the HIP executor + */ + automatical(std::shared_ptr exec) + : automatical(exec->get_num_warps(), exec->get_warp_size(), false) {} - automatical(int64_t nwarps) - : strategy_type("automatical"), nwarps_(nwarps) + /** + * Creates an automatical strategy with specified parameters + * + * @param nwarps the number of warps in the executor + * @param warp_size the warp size of the executor + * @param cuda_strategy whether the `cuda_strategy` needs to be used. + * + * @note The warp_size must be the size of full warp. When using this + * constructor, set_strategy needs to be called with correct + * parameters which is replaced during the conversion.
+ */ + automatical(int64_t nwarps, int warp_size = 32, + bool cuda_strategy = true) + : strategy_type("automatical"), + nwarps_(nwarps), + warp_size_(warp_size), + cuda_strategy_(cuda_strategy), + max_length_per_row_(0) {} void process(const Array &mtx_row_ptrs, - Array *mtx_srow) + Array *mtx_srow) override { - // if the number of stored elements is larger than 1e6 or + // if the number of stored elements is larger than <nnz_limit> or // the maximum number of stored elements per row is larger than - // 64, use load_balance otherwise use classical + // <row_len_limit>, use load_balance otherwise use classical + index_type nnz_limit = nvidia_nnz_limit; + index_type row_len_limit = nvidia_row_len_limit; +#if GINKGO_HIP_PLATFORM_HCC + if (!cuda_strategy_) { + nnz_limit = amd_nnz_limit; + row_len_limit = amd_row_len_limit; + } +#endif // GINKGO_HIP_PLATFORM_HCC auto host_mtx_exec = mtx_row_ptrs.get_executor()->get_master(); const bool is_mtx_on_host{host_mtx_exec == mtx_row_ptrs.get_executor()}; @@ -278,8 +554,9 @@ class Csr : public EnableLinOp>, row_ptrs = row_ptrs_host.get_const_data(); } const auto num_rows = mtx_row_ptrs.get_num_elems() - 1; - if (row_ptrs[num_rows] > index_type(1e6)) { - load_balance actual_strategy(nwarps_); + if (row_ptrs[num_rows] > nnz_limit) { + load_balance actual_strategy(nwarps_, warp_size_, + cuda_strategy_); if (is_mtx_on_host) { actual_strategy.process(mtx_row_ptrs, mtx_srow); } else { @@ -291,8 +568,9 @@ class Csr : public EnableLinOp>, for (index_type i = 1; i < num_rows + 1; i++) { maxnum = max(maxnum, row_ptrs[i] - row_ptrs[i - 1]); } - if (maxnum > 64) { - load_balance actual_strategy(nwarps_); + if (maxnum > row_len_limit) { + load_balance actual_strategy(nwarps_, warp_size_, + cuda_strategy_); if (is_mtx_on_host) { actual_strategy.process(mtx_row_ptrs, mtx_srow); } else { @@ -303,23 +581,75 @@ class Csr : public EnableLinOp>, classical actual_strategy; if (is_mtx_on_host) { actual_strategy.process(mtx_row_ptrs, mtx_srow); + max_length_per_row_ = +
actual_strategy.get_max_length_per_row(); } else { actual_strategy.process(row_ptrs_host, mtx_srow); + max_length_per_row_ = + actual_strategy.get_max_length_per_row(); } this->set_name(actual_strategy.get_name()); } } } - int64_t clac_size(const int64_t nnz) + int64_t clac_size(const int64_t nnz) override + { + return std::make_shared(nwarps_, warp_size_, + cuda_strategy_) + ->clac_size(nnz); + } + + index_type get_max_length_per_row() const noexcept { - return std::make_shared(nwarps_)->clac_size(nnz); + return max_length_per_row_; + } + + std::shared_ptr copy() override + { + return std::make_shared(nwarps_, warp_size_, + cuda_strategy_); } private: int64_t nwarps_; + int warp_size_; + bool cuda_strategy_; + index_type max_length_per_row_; }; + void convert_to(Csr *result) const override + { + bool same_executor = this->get_executor() == result->get_executor(); + // NOTE: as soon as strategies are improved, this can be reverted + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->row_ptrs_ = this->row_ptrs_; + result->srow_ = this->srow_; + result->set_size(this->get_size()); + if (!same_executor) { + convert_strategy_helper(result); + } else { + result->set_strategy(std::move(this->get_strategy()->copy())); + } + // END NOTE + } + + void move_to(Csr *result) override + { + bool same_executor = this->get_executor() == result->get_executor(); + EnableLinOp::move_to(result); + if (!same_executor) { + detail::strategy_rebuild_helper(result); + } + } + friend class Csr, IndexType>; + + void convert_to( + Csr, IndexType> *result) const override; + + void move_to(Csr, IndexType> *result) override; + void convert_to(Dense *other) const override; void move_to(Dense *other) override; @@ -352,6 +682,18 @@ class Csr : public EnableLinOp>, std::unique_ptr conj_transpose() const override; + std::unique_ptr row_permute( + const Array *permutation_indices) const override; + + std::unique_ptr column_permute( + const Array *permutation_indices) 
const override; + + std::unique_ptr inverse_row_permute( + const Array *inverse_permutation_indices) const override; + + std::unique_ptr inverse_column_permute( + const Array *inverse_permutation_indices) const override; + /** * Sorts all (value, col_idx) pairs in each row by column index */ @@ -470,6 +812,17 @@ class Csr : public EnableLinOp>, return strategy_; } + /** + * Set the strategy + * + * @param strategy the csr strategy + */ + void set_strategy(std::shared_ptr strategy) + { + strategy_ = std::move(strategy->copy()); + this->make_srow(); + } + protected: /** * Creates an uninitialized CSR matrix of the specified size. @@ -492,14 +845,13 @@ class Csr : public EnableLinOp>, */ Csr(std::shared_ptr exec, const dim<2> &size = dim<2>{}, size_type num_nonzeros = {}, - std::shared_ptr strategy = std::make_shared()) + std::shared_ptr strategy = std::make_shared()) : EnableLinOp(exec, size), values_(exec, num_nonzeros), col_idxs_(exec, num_nonzeros), - // avoid allocation for empty matrix - row_ptrs_(exec, size[0] + (size[0] > 0)), + row_ptrs_(exec, size[0] + 1), srow_(exec, strategy->clac_size(num_nonzeros)), - strategy_(std::move(strategy)) + strategy_(strategy->copy()) {} /** @@ -526,13 +878,13 @@ class Csr : public EnableLinOp>, typename RowPtrsArray> Csr(std::shared_ptr exec, const dim<2> &size, ValuesArray &&values, ColIdxsArray &&col_idxs, RowPtrsArray &&row_ptrs, - std::shared_ptr strategy = std::make_shared()) + std::shared_ptr strategy = std::make_shared()) : EnableLinOp(exec, size), values_{exec, std::forward(values)}, col_idxs_{exec, std::forward(col_idxs)}, row_ptrs_{exec, std::forward(row_ptrs)}, srow_(exec), - strategy_(std::move(strategy)) + strategy_(strategy->copy()) { GKO_ASSERT_EQ(values_.get_num_elems(), col_idxs_.get_num_elems()); GKO_ASSERT_EQ(this->get_size()[0] + 1, row_ptrs_.get_num_elems()); @@ -544,6 +896,90 @@ class Csr : public EnableLinOp>, void apply_impl(const LinOp *alpha, const LinOp *b, const LinOp *beta, LinOp *x) const 
override; + // TODO clean this up as soon as we improve strategy_type + template + void convert_strategy_helper(CsrType *result) const + { + auto strat = this->get_strategy().get(); + std::shared_ptr new_strat; + if (dynamic_cast(strat)) { + new_strat = std::make_shared(); + } else if (dynamic_cast(strat)) { + new_strat = std::make_shared(); + } else if (dynamic_cast(strat)) { + new_strat = std::make_shared(); + } else if (dynamic_cast(strat)) { + new_strat = std::make_shared(); + } else { + auto rexec = result->get_executor(); + auto cuda_exec = + std::dynamic_pointer_cast(rexec); + auto hip_exec = std::dynamic_pointer_cast(rexec); + auto lb = dynamic_cast(strat); + if (cuda_exec) { + if (lb) { + new_strat = + std::make_shared( + cuda_exec); + } else { + new_strat = std::make_shared( + cuda_exec); + } + } else if (hip_exec) { + if (lb) { + new_strat = + std::make_shared( + hip_exec); + } else { + new_strat = std::make_shared( + hip_exec); + } + } else { + // Try to preserve this executor's configuration + auto this_cuda_exec = + std::dynamic_pointer_cast( + this->get_executor()); + auto this_hip_exec = + std::dynamic_pointer_cast( + this->get_executor()); + if (this_cuda_exec) { + if (lb) { + new_strat = + std::make_shared( + this_cuda_exec); + } else { + new_strat = + std::make_shared( + this_cuda_exec); + } + } else if (this_hip_exec) { + if (lb) { + new_strat = + std::make_shared( + this_hip_exec); + } else { + new_strat = + std::make_shared( + this_hip_exec); + } + } else { + // We had a load balance or automatical strategy from a non + // HIP or Cuda executor and are moving to a non HIP or Cuda + // executor. + // FIXME this creates a long delay + if (lb) { + new_strat = + std::make_shared(); + } else { + new_strat = + std::make_shared(); + } + } + } + } + result->set_strategy(new_strat); + } + /** * Computes srow. It should be run after changing any row_ptrs_ value. 
*/ @@ -562,6 +998,43 @@ class Csr : public EnableLinOp>, }; +namespace detail { + + +/** + * When strategy is load_balance or automatical, rebuild the strategy + * according to executor's property. + * + * @param result the csr matrix. + */ +template +void strategy_rebuild_helper(Csr *result) +{ + using load_balance = typename Csr::load_balance; + using automatical = typename Csr::automatical; + auto strategy = result->get_strategy(); + auto executor = result->get_executor(); + if (std::dynamic_pointer_cast(strategy)) { + if (auto exec = + std::dynamic_pointer_cast(executor)) { + result->set_strategy(std::make_shared(exec)); + } else if (auto exec = std::dynamic_pointer_cast( + executor)) { + result->set_strategy(std::make_shared(exec)); + } + } else if (std::dynamic_pointer_cast(strategy)) { + if (auto exec = + std::dynamic_pointer_cast(executor)) { + result->set_strategy(std::make_shared(exec)); + } else if (auto exec = std::dynamic_pointer_cast( + executor)) { + result->set_strategy(std::make_shared(exec)); + } + } +} + + +} // namespace detail } // namespace matrix } // namespace gko diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index ab0ae4fc9dd..a98861f75f2 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,6 +34,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_CORE_MATRIX_DENSE_HPP_ +#include + + #include #include #include @@ -43,9 +46,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include - - namespace gko { namespace matrix { @@ -87,6 +87,7 @@ class SparsityCsr; template class Dense : public EnableLinOp>, public EnableCreateMethod>, + public ConvertibleTo>>, public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, @@ -103,7 +104,9 @@ class Dense : public EnableLinOp>, public ReadableFromMatrixData, public WritableToMatrixData, public WritableToMatrixData, - public Transposable { + public Transposable, + public Permutable, + public Permutable { friend class EnableCreateMethod; friend class EnablePolymorphicObject; friend class Coo; @@ -125,6 +128,7 @@ class Dense : public EnableLinOp>, using value_type = ValueType; using index_type = int64; + using transposed_type = Dense; using mat_data = gko::matrix_data; using mat_data32 = gko::matrix_data; @@ -141,11 +145,15 @@ class Dense : public EnableLinOp>, // using operator `->`) is currently required to be compatible with // CUDA 10.1. // Otherwise, it results in a compile error. - // TODO Check if the compiler error is fixed and revert to `operator->`. 
- return Dense::create((*other).get_executor(), (*other).get_size(), - (*other).get_stride()); + return (*other).create_with_same_config(); } + friend class Dense>; + + void convert_to(Dense> *result) const override; + + void move_to(Dense> *result) override; + void convert_to(Coo *result) const override; void move_to(Coo *result) override; @@ -206,6 +214,31 @@ class Dense : public EnableLinOp>, std::unique_ptr conj_transpose() const override; + std::unique_ptr row_permute( + const Array *permutation_indices) const override; + + std::unique_ptr row_permute( + const Array *permutation_indices) const override; + + std::unique_ptr column_permute( + const Array *permutation_indices) const override; + + std::unique_ptr column_permute( + const Array *permutation_indices) const override; + + std::unique_ptr inverse_row_permute( + const Array *inverse_permutation_indices) const override; + + std::unique_ptr inverse_row_permute( + const Array *inverse_permutation_indices) const override; + + std::unique_ptr inverse_column_permute( + const Array *inverse_permutation_indices) const override; + + std::unique_ptr inverse_column_permute( + const Array *inverse_permutation_indices) const override; + + /** * Returns a pointer to the array of values of the matrix. * @@ -448,6 +481,17 @@ class Dense : public EnableLinOp>, values_.get_num_elems()); } + /** + * Creates a Dense matrix with the same configuration as the callers matrix. + * + * @returns a Dense matrix with the same configuration as the caller. 
+ */ + virtual std::unique_ptr create_with_same_config() const + { + return Dense::create(this->get_executor(), this->get_size(), + this->get_stride()); + } + /** * @copydoc scale(const LinOp *) * diff --git a/include/ginkgo/core/matrix/ell.hpp b/include/ginkgo/core/matrix/ell.hpp index d9572ec0759..237ce920799 100644 --- a/include/ginkgo/core/matrix/ell.hpp +++ b/include/ginkgo/core/matrix/ell.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -70,6 +70,7 @@ class Csr; template class Ell : public EnableLinOp>, public EnableCreateMethod>, + public ConvertibleTo, IndexType>>, public ConvertibleTo>, public ConvertibleTo>, public ReadableFromMatrixData, @@ -87,6 +88,13 @@ class Ell : public EnableLinOp>, using index_type = IndexType; using mat_data = matrix_data; + friend class Ell, IndexType>; + + void convert_to( + Ell, IndexType> *result) const override; + + void move_to(Ell, IndexType> *result) override; + void convert_to(Dense *other) const override; void move_to(Dense *other) override; diff --git a/include/ginkgo/core/matrix/hybrid.hpp b/include/ginkgo/core/matrix/hybrid.hpp index a759e0be7b2..9c3cb7bda33 100644 --- a/include/ginkgo/core/matrix/hybrid.hpp +++ b/include/ginkgo/core/matrix/hybrid.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -68,12 +68,14 @@ class Csr; * @ingroup LinOp */ template -class Hybrid : public EnableLinOp>, - public EnableCreateMethod>, - public ConvertibleTo>, - public ConvertibleTo>, - public ReadableFromMatrixData, - public WritableToMatrixData { +class Hybrid + : public EnableLinOp>, + public EnableCreateMethod>, + public ConvertibleTo, IndexType>>, + public ConvertibleTo>, + public ConvertibleTo>, + public ReadableFromMatrixData, + public WritableToMatrixData { friend class EnableCreateMethod; friend class EnablePolymorphicObject; friend class Dense; @@ -239,6 +241,9 @@ class Hybrid : public EnableLinOp>, { auto row_nnz_val = row_nnz->get_data(); auto num_rows = row_nnz->get_num_elems(); + if (num_rows == 0) { + return 0; + } std::sort(row_nnz_val, row_nnz_val + num_rows); if (percent_ < 1) { auto percent_pos = static_cast(num_rows * percent_); @@ -253,7 +258,7 @@ class Hybrid : public EnableLinOp>, }; /** - * imbalance_bounded_limit is a stratgy_type which decides the number of + * imbalance_bounded_limit is a strategy_type which decides the number of * stored elements per row of the ell part. It uses the imbalance_limit and * adds the upper bound of the number of ell's cols by the number of rows. */ @@ -283,7 +288,7 @@ class Hybrid : public EnableLinOp>, /** - * minimal_storage_limit is a stratgy_type which decides the number of + * minimal_storage_limit is a strategy_type which decides the number of * stored elements per row of the ell part. It is determined by the size of * ValueType and IndexType, the storage is the minimum among all partition. */ @@ -310,7 +315,7 @@ class Hybrid : public EnableLinOp>, /** - * automatic is a stratgy_type which decides the number of stored elements + * automatic is a strategy_type which decides the number of stored elements * per row of the ell part automatically. 
*/ class automatic : public strategy_type { @@ -330,6 +335,13 @@ class Hybrid : public EnableLinOp>, imbalance_bounded_limit strategy_; }; + friend class Hybrid, IndexType>; + + void convert_to( + Hybrid, IndexType> *result) const override; + + void move_to(Hybrid, IndexType> *result) override; + void convert_to(Dense *other) const override; void move_to(Dense *other) override; diff --git a/include/ginkgo/core/matrix/identity.hpp b/include/ginkgo/core/matrix/identity.hpp index c79a2389c7a..4cc9065d92f 100644 --- a/include/ginkgo/core/matrix/identity.hpp +++ b/include/ginkgo/core/matrix/identity.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -61,7 +61,8 @@ namespace matrix { */ template class Identity : public EnableLinOp>, - public EnableCreateMethod> { + public EnableCreateMethod>, + public Transposable { friend class EnablePolymorphicObject; friend class EnableCreateMethod; @@ -70,6 +71,12 @@ class Identity : public EnableLinOp>, using EnableLinOp::move_to; using value_type = ValueType; + using transposed_type = Identity; + + std::unique_ptr transpose() const override; + + std::unique_ptr conj_transpose() const override; + protected: /** diff --git a/include/ginkgo/core/matrix/permutation.hpp b/include/ginkgo/core/matrix/permutation.hpp new file mode 100644 index 00000000000..656cee4ae3d --- /dev/null +++ b/include/ginkgo/core/matrix/permutation.hpp @@ -0,0 +1,249 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_CORE_MATRIX_PERMUTATION_HPP_ +#define GKO_CORE_MATRIX_PERMUTATION_HPP_ + + +#include +#include +#include +#include + + +#include +#include +#include +#include +#include +#include +#include + + +namespace gko { +namespace matrix { + +/** @internal Bit mask type (64-bit) selecting the permutations to apply */ +using mask_type = gko::uint64; + +static constexpr mask_type row_permute = mask_type{1}; +static constexpr mask_type column_permute = mask_type{1 << 2}; +static constexpr mask_type inverse_permute = mask_type{1 << 3}; + +/** + * Permutation is a matrix "format" which stores the row and column permutation + * arrays which can be used for re-ordering the rows and columns of a matrix. + * + * @tparam IndexType precision of permutation array indices. + * + * @note This format is used mainly to allow for an abstraction of the + * permutation/re-ordering and provides the user with an apply method which + * calls the respective LinOp's permute operation if the respective LinOp + * implements the Permutable interface. As such it only stores an array of the + * permutation indices. + * + * @ingroup permutation + * @ingroup mat_formats + * @ingroup LinOp + */ +template +class Permutation : public EnableLinOp>, + public EnableCreateMethod> { + friend class EnableCreateMethod; + friend class EnablePolymorphicObject; + +public: + using index_type = IndexType; + + /** + * Returns a pointer to the array of permutation. + * + * @return the pointer to the row permutation array. + */ + index_type *get_permutation() noexcept { return permutation_.get_data(); } + + /** + * @copydoc get_permutation() + * + * @note This is the constant version of the function, which can be + * significantly more memory efficient than the non-constant version, + * so always prefer this version. 
+ */ + const index_type *get_const_permutation() const noexcept + { + return permutation_.get_const_data(); + } + + /** + * Returns the number of elements explicitly stored in the permutation + * array. + * + * @return the number of elements explicitly stored in the permutation + * array. + */ + size_type get_permutation_size() const noexcept + { + return permutation_.get_num_elems(); + } + + /** + * Get the permute masks + * + * @return the permute masks + */ + mask_type get_permute_mask() const { return enabled_permute_; } + + /** + * Set the permute masks + * + * @param permute_mask the permute masks + */ + void set_permute_mask(mask_type permute_mask) + { + enabled_permute_ = permute_mask; + } + + +protected: + /** + * Creates an uninitialized Permutation array on the specified executor. + * + * @param exec Executor associated to the LinOp + */ + Permutation(std::shared_ptr exec) + : Permutation(std::move(exec), dim<2>{}) + {} + + /** + * Creates uninitialized Permutation arrays of the specified size. + * + * @param exec Executor associated to the matrix + * @param size size of the permutable matrix + * @param enabled_permute mask for the type of permutation to apply. + */ + Permutation(std::shared_ptr exec, const dim<2> &size, + const mask_type &enabled_permute = row_permute) + : EnableLinOp(exec, size), + permutation_(exec, size[0]), + row_size_(size[0]), + col_size_(size[1]), + enabled_permute_(enabled_permute) + {} + + /** + * Creates a Permutation matrix from an already allocated (and initialized) + * permutation array. + * + * @tparam IndicesArray type of array of indices + * + * @param exec Executor associated to the matrix + * @param size size of the permutation array. + * @param permutation_indices array of permutation indices + * @param enabled_permute mask for the type of permutation to apply. 
+ * + * @note If `permutation_indices` is not an rvalue, not an array of + * IndexType, or is on the wrong executor, an internal copy will be created, + * and the original array data will not be used in the matrix. + */ + template + Permutation(std::shared_ptr exec, const dim<2> &size, + IndicesArray &&permutation_indices, + const mask_type &enabled_permute = row_permute) + : EnableLinOp(exec, size), + permutation_{exec, std::forward(permutation_indices)}, + row_size_(size[0]), + col_size_(size[1]), + enabled_permute_(enabled_permute) + { + if (enabled_permute_ & row_permute) { + GKO_ASSERT_EQ(size[0], permutation_.get_num_elems()); + } + if (enabled_permute_ & column_permute) { + GKO_ASSERT_EQ(size[1], permutation_.get_num_elems()); + } + } + + void apply_impl(const LinOp *in, LinOp *out) const + { + auto perm = as>(in); + std::unique_ptr tmp{}; + if (enabled_permute_ & inverse_permute) { + if (enabled_permute_ & row_permute) { + tmp = perm->inverse_row_permute(&permutation_); + } + if (enabled_permute_ & column_permute) { + if (enabled_permute_ & row_permute) { + tmp = as>(tmp.get()) + ->inverse_column_permute(&permutation_); + } else { + tmp = perm->inverse_column_permute(&permutation_); + } + } + } else { + if (enabled_permute_ & row_permute) { + tmp = perm->row_permute(&permutation_); + } + if (enabled_permute_ & column_permute) { + if (enabled_permute_ & row_permute) { + tmp = as>(tmp.get())->column_permute( + &permutation_); + } else { + tmp = perm->column_permute(&permutation_); + } + } + } + out->copy_from(std::move(tmp)); + } + + + void apply_impl(const LinOp *, const LinOp *in, const LinOp *, + LinOp *out) const + { + // Ignores alpha and beta and just performs a normal permutation as an + // advanced apply does not really make sense here. 
+ this->apply_impl(in, out); + } + + +private: + Array permutation_; + size_type row_size_; + size_type col_size_; + mask_type enabled_permute_; +}; + + +} // namespace matrix +} // namespace gko + + +#endif // GKO_CORE_MATRIX_PERMUTATION_HPP_ diff --git a/include/ginkgo/core/matrix/sellp.hpp b/include/ginkgo/core/matrix/sellp.hpp index 59423f33fd4..021b3870885 100644 --- a/include/ginkgo/core/matrix/sellp.hpp +++ b/include/ginkgo/core/matrix/sellp.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -67,6 +67,7 @@ class Csr; template class Sellp : public EnableLinOp>, public EnableCreateMethod>, + public ConvertibleTo, IndexType>>, public ConvertibleTo>, public ConvertibleTo>, public ReadableFromMatrixData, @@ -84,6 +85,13 @@ class Sellp : public EnableLinOp>, using index_type = IndexType; using mat_data = matrix_data; + friend class Sellp, IndexType>; + + void convert_to( + Sellp, IndexType> *result) const override; + + void move_to(Sellp, IndexType> *result) override; + void convert_to(Dense *other) const override; void move_to(Dense *other) override; @@ -305,10 +313,8 @@ class Sellp : public EnableLinOp>, : EnableLinOp(exec, size), values_(exec, slice_size * total_cols), col_idxs_(exec, slice_size * total_cols), - slice_lengths_(exec, - (size[0] == 0) ? 0 : ceildiv(size[0], slice_size)), - slice_sets_(exec, - (size[0] == 0) ? 
0 : ceildiv(size[0], slice_size) + 1), + slice_lengths_(exec, ceildiv(size[0], slice_size)), + slice_sets_(exec, ceildiv(size[0], slice_size) + 1), slice_size_(slice_size), stride_factor_(stride_factor), total_cols_(total_cols) diff --git a/include/ginkgo/core/matrix/sparsity_csr.hpp b/include/ginkgo/core/matrix/sparsity_csr.hpp index 5d1b0e580a8..25374a251b8 100644 --- a/include/ginkgo/core/matrix/sparsity_csr.hpp +++ b/include/ginkgo/core/matrix/sparsity_csr.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -84,7 +84,7 @@ class SparsityCsr using value_type = ValueType; using index_type = IndexType; - + using transposed_type = SparsityCsr; using mat_data = matrix_data; void read(const mat_data &data) override; @@ -198,17 +198,9 @@ class SparsityCsr const dim<2> &size = dim<2>{}, size_type num_nonzeros = {}) : EnableLinOp(exec, size), col_idxs_(exec, num_nonzeros), - // avoid allocation for empty matrix - row_ptrs_(exec, size[0] + (size[0] > 0)) - { - if (size[0] > 0) { - auto tmp = Array{exec->get_master(), 1}; - tmp.get_data()[0] = one(); - value_ = Array{exec, std::move(tmp)}; - } else { - value_ = Array{exec}; - } - } + row_ptrs_(exec, size[0] + 1), + value_(exec, {one()}) + {} /** * Creates a SparsityCsr matrix from already allocated (and initialized) row @@ -235,11 +227,9 @@ class SparsityCsr value_type value = one()) : EnableLinOp(exec, size), col_idxs_{exec, std::forward(col_idxs)}, - row_ptrs_{exec, std::forward(row_ptrs)} + row_ptrs_{exec, std::forward(row_ptrs)}, + value_{exec, {value}} { - auto tmp = Array{exec->get_master(), 1}; - tmp.get_data()[0] = value; - value_ = Array{exec, std::move(tmp)}; GKO_ASSERT_EQ(this->get_size()[0] + 1, row_ptrs_.get_num_elems()); } diff --git a/include/ginkgo/core/preconditioner/ilu.hpp 
b/include/ginkgo/core/preconditioner/ilu.hpp index 6b41cc176da..d3faaab5b4d 100644 --- a/include/ginkgo/core/preconditioner/ilu.hpp +++ b/include/ginkgo/core/preconditioner/ilu.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -50,7 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include +#include namespace gko { @@ -109,8 +109,10 @@ namespace preconditioner { */ template , typename USolverType = solver::UpperTrs<>, bool ReverseApply = false, - typename IndexTypeParIlu = int32> -class Ilu : public EnableLinOp> { + typename IndexType = int32> +class Ilu : public EnableLinOp< + Ilu>, + public Transposable { friend class EnableLinOp; friend class EnablePolymorphicObject; @@ -123,7 +125,10 @@ class Ilu : public EnableLinOp> { using l_solver_type = LSolverType; using u_solver_type = USolverType; static constexpr bool performs_reverse_apply = ReverseApply; - using index_type_par_ilu = IndexTypeParIlu; + using index_type = IndexType; + using transposed_type = + Ilu; GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) { @@ -138,6 +143,12 @@ class Ilu : public EnableLinOp> { */ std::shared_ptr GKO_FACTORY_PARAMETER( u_solver_factory, nullptr); + + /** + * Factory for the factorization + */ + std::shared_ptr GKO_FACTORY_PARAMETER( + factorization_factory, nullptr); }; GKO_ENABLE_LIN_OP_FACTORY(Ilu, parameters, Factory); @@ -163,15 +174,47 @@ class Ilu : public EnableLinOp> { return u_solver_; } + std::unique_ptr transpose() const override + { + std::unique_ptr transposed{ + new transposed_type{this->get_executor()}}; + transposed->set_size(gko::transpose(this->get_size())); + transposed->l_solver_ = + share(as( + this->get_u_solver()->transpose())); + transposed->u_solver_ = + share(as( + 
this->get_l_solver()->transpose())); + + return std::move(transposed); + } + + std::unique_ptr conj_transpose() const override + { + std::unique_ptr transposed{ + new transposed_type{this->get_executor()}}; + transposed->set_size(gko::transpose(this->get_size())); + transposed->l_solver_ = + share(as( + this->get_u_solver()->conj_transpose())); + transposed->u_solver_ = + share(as( + this->get_l_solver()->conj_transpose())); + + return std::move(transposed); + } + protected: void apply_impl(const LinOp *b, LinOp *x) const override { set_cache_to(b); if (!ReverseApply) { l_solver_->apply(b, cache_.intermediate.get()); + x->copy_from(cache_.intermediate.get()); u_solver_->apply(cache_.intermediate.get(), x); } else { u_solver_->apply(b, cache_.intermediate.get()); + x->copy_from(cache_.intermediate.get()); l_solver_->apply(cache_.intermediate.get(), x); } } @@ -197,24 +240,33 @@ class Ilu : public EnableLinOp> { : EnableLinOp(factory->get_executor(), lin_op->get_size()), parameters_{factory->get_parameters()} { - auto comp_cast = - dynamic_cast *>(lin_op.get()); + auto comp = + std::dynamic_pointer_cast>(lin_op); std::shared_ptr l_factor; std::shared_ptr u_factor; - if (comp_cast == nullptr) { + // build factorization if we weren't passed a composition + if (!comp) { auto exec = lin_op->get_executor(); - auto par_ilu = - factorization::ParIlu::build() - .on(exec) - ->generate(lin_op); - l_factor = par_ilu->get_l_factor(); - u_factor = par_ilu->get_u_factor(); - } else if (comp_cast->get_operators().size() == 2) { - l_factor = comp_cast->get_operators()[0]; - u_factor = comp_cast->get_operators()[1]; + if (!parameters_.factorization_factory) { + parameters_.factorization_factory = + factorization::ParIlu::build().on( + exec); + } + auto fact = std::shared_ptr( + parameters_.factorization_factory->generate(lin_op)); + // ensure that the result is a composition + comp = + std::dynamic_pointer_cast>(fact); + if (!comp) { + GKO_NOT_SUPPORTED(comp); + } + } + if 
(comp->get_operators().size() == 2) { + l_factor = comp->get_operators()[0]; + u_factor = comp->get_operators()[1]; } else { - GKO_NOT_SUPPORTED(comp_cast); + GKO_NOT_SUPPORTED(comp); } GKO_ASSERT_EQUAL_DIMENSIONS(l_factor, u_factor); @@ -276,8 +328,7 @@ class Ilu : public EnableLinOp> { * */ template - struct has_with_criteria : std::false_type { - }; + struct has_with_criteria : std::false_type {}; /** * @copydoc has_with_criteria @@ -291,8 +342,7 @@ class Ilu : public EnableLinOp> { SolverType, xstd::void_t>() .with_criteria(with_criteria_param_type()))>> - : std::true_type { - }; + : std::true_type {}; /** @@ -308,7 +358,7 @@ class Ilu : public EnableLinOp> { generate_default_solver(const std::shared_ptr &exec, const std::shared_ptr &mtx) { - constexpr value_type default_reduce_residual{1e-4}; + constexpr gko::remove_complex default_reduce_residual{1e-4}; const unsigned int default_max_iters{ static_cast(mtx->get_size()[0])}; @@ -316,7 +366,7 @@ class Ilu : public EnableLinOp> { .with_criteria(gko::stop::Iteration::build() .with_max_iters(default_max_iters) .on(exec), - gko::stop::ResidualNormReduction<>::build() + gko::stop::ResidualNormReduction::build() .with_reduction_factor(default_reduce_residual) .on(exec)) .on(exec) diff --git a/include/ginkgo/core/preconditioner/isai.hpp b/include/ginkgo/core/preconditioner/isai.hpp new file mode 100644 index 00000000000..f7c6d8a701c --- /dev/null +++ b/include/ginkgo/core/preconditioner/isai.hpp @@ -0,0 +1,218 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_PRECONDITIONER_ISAI_HPP_ +#define GKO_CORE_PRECONDITIONER_ISAI_HPP_ + + +#include + + +#include +#include +#include +#include +#include +#include + + +namespace gko { +/** + * @brief The Preconditioner namespace. + * + * @ingroup precond + */ +namespace preconditioner { + + +/** + * This enum lists the types of the ISAI preconditioner. + * + * ISAI can either generate a lower triangular matrix, or an upper triangular + * matrix. + */ +enum struct isai_type { lower, upper }; + +/** + * The Incomplete Sparse Approximate Inverse (ISAI) Preconditioner generates + * an approximate inverse matrix for a given lower triangular matrix L or upper + * triangular matrix U. 
+ * + * Using the preconditioner computes $aiU * x$ or $aiL * x$ (depending on the + * type of the Isai) for a given vector x (may have multiple right hand sides). + * aiU and aiL are the approximate inverses for U and L respectively. + * + * The sparsity pattern used for the approximate inverse is the same as + * the sparsity pattern of the respective triangular matrix. + * + * For more details on the algorithm, see the paper + * + * Incomplete Sparse Approximate Inverses for Parallel Preconditioning, + * which is the basis for this work. + * + * @note GPU implementations can only handle the vector unit width `width` + * (warp size for CUDA) as number of elements per row in the sparse + * matrix. If there are more than `width` elements per row, the remaining + * elements will be ignored. + * + * @tparam IsaiType determines if the ISAI is generated for a lower triangular + * matrix or an upper triangular matrix + * @tparam ValueType precision of matrix elements + * @tparam IndexType precision of matrix indexes + * + * @ingroup isai + * @ingroup precond + * @ingroup LinOp + */ +template +class Isai : public EnableLinOp>, + public Transposable { + friend class EnableLinOp; + friend class EnablePolymorphicObject; + friend class Isai; + +public: + using value_type = ValueType; + using index_type = IndexType; + using transposed_type = + Isai; + using Csr = matrix::Csr; + static constexpr isai_type type{IsaiType}; + + /** + * Returns the approximate inverse of the given matrix (either L or U, + * depending on the template parameter IsaiType). + * + * @returns the generated approximate inverse + */ + std::shared_ptr get_approximate_inverse() const + { + return approximate_inverse_; + } + + GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) + { + /** + * @brief Optimization parameter that skips the sorting of the input + * matrix (only skip if it is known that it is already sorted). 
+ * + * The algorithm to create the approximate inverses requires the + * input matrix to be sorted. If it is, this parameter can be set to + * `true` to skip the sorting for better performance. + */ + bool GKO_FACTORY_PARAMETER(skip_sorting, false); + + /** + * @brief Which power of the input matrix should be used for the + * sparsity pattern. + * + * The algorithm symbolically computes M^n and uses this sparsity + * pattern for the sparse inverse. + * Must be at least 1, default value 1. + */ + int GKO_FACTORY_PARAMETER(sparsity_power, 1); + }; + + GKO_ENABLE_LIN_OP_FACTORY(Isai, parameters, Factory); + GKO_ENABLE_BUILD_METHOD(Factory); + + std::unique_ptr transpose() const override; + + std::unique_ptr conj_transpose() const override; + +protected: + explicit Isai(std::shared_ptr exec) + : EnableLinOp(std::move(exec)) + {} + + /** + * Creates an Isai preconditioner from a matrix using an Isai::Factory. + * + * @param factory the factory to use to create the preconditoner + * @param factors Composition of a lower triangular and an + * upper triangular matrix (L and U) + */ + explicit Isai(const Factory *factory, + std::shared_ptr system_matrix) + : EnableLinOp(factory->get_executor(), system_matrix->get_size()), + parameters_{factory->get_parameters()} + { + const auto skip_sorting = parameters_.skip_sorting; + const auto power = parameters_.sparsity_power; + generate_inverse(system_matrix, skip_sorting, power); + } + + void apply_impl(const LinOp *b, LinOp *x) const override + { + approximate_inverse_->apply(b, x); + } + + void apply_impl(const LinOp *alpha, const LinOp *b, const LinOp *beta, + LinOp *x) const override + { + approximate_inverse_->apply(alpha, b, beta, x); + } + +private: + /** + * Generates the approximate inverse for a triangular matrix and + * stores the result in `approximate_inverse_`. 
+ * + * @param to_invert the source triangular matrix used to generate + * the approximate inverse + * + * @param skip_sorting dictates if the sorting of the input matrix should + * be skipped. + */ + void generate_inverse(std::shared_ptr to_invert, + bool skip_sorting, int power); + +private: + std::shared_ptr approximate_inverse_; +}; + + +template +using LowerIsai = Isai; + +template +using UpperIsai = Isai; + + +} // namespace preconditioner +} // namespace gko + + +#endif // GKO_CORE_PRECONDITIONER_ISAI_HPP_ diff --git a/include/ginkgo/core/preconditioner/jacobi.hpp b/include/ginkgo/core/preconditioner/jacobi.hpp index 1c70ee0c498..fc6116f5d5e 100644 --- a/include/ginkgo/core/preconditioner/jacobi.hpp +++ b/include/ginkgo/core/preconditioner/jacobi.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -206,7 +206,8 @@ struct block_interleaved_storage_scheme { template class Jacobi : public EnableLinOp>, public ConvertibleTo>, - public WritableToMatrixData { + public WritableToMatrixData, + public Transposable { friend class EnableLinOp; friend class EnablePolymorphicObject; @@ -216,6 +217,7 @@ class Jacobi : public EnableLinOp>, using value_type = ValueType; using index_type = IndexType; using mat_data = matrix_data; + using transposed_type = Jacobi; /** * Returns the number of blocks of the operator. @@ -287,15 +289,30 @@ class Jacobi : public EnableLinOp>, void write(mat_data &data) const override; + std::unique_ptr transpose() const override; + + std::unique_ptr conj_transpose() const override; + GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) { /** * Maximal size of diagonal blocks. * - * @note This value has to be between 1 and 32. + * @note This value has to be between 1 and 32 (NVIDIA)/64 (AMD). 
*/ uint32 GKO_FACTORY_PARAMETER(max_block_size, 32u); + /** + * Stride between two columns of a block (as number of elements). + * + * Should be a multiple of cache line size for best performance. + * + * @note If this value is 0, it uses 64 in hip AMD but 32 in NVIDIA or + * reference executor. The allowed value: 0, 64 for AMD and 0, 32 + * for NVIDIA + */ + uint32 GKO_FACTORY_PARAMETER(max_block_stride, 0u); + /** * Starting (row / column) indexes of individual blocks. * @@ -478,31 +495,22 @@ class Jacobi : public EnableLinOp>, explicit Jacobi(const Factory *factory, std::shared_ptr system_matrix) : EnableLinOp(factory->get_executor(), - transpose(system_matrix->get_size())), + gko::transpose(system_matrix->get_size())), parameters_{factory->get_parameters()}, - storage_scheme_{compute_storage_scheme(parameters_.max_block_size)}, + storage_scheme_{this->compute_storage_scheme( + parameters_.max_block_size, parameters_.max_block_stride)}, num_blocks_{parameters_.block_pointers.get_num_elems() - 1}, blocks_(factory->get_executor(), storage_scheme_.compute_storage_space( parameters_.block_pointers.get_num_elems() - 1)), conditioning_(factory->get_executor()) { - if (parameters_.max_block_size > 32 || parameters_.max_block_size < 1) { - GKO_NOT_SUPPORTED(this); - } parameters_.block_pointers.set_executor(this->get_executor()); parameters_.storage_optimization.block_wise.set_executor( this->get_executor()); this->generate(lend(system_matrix)); } - /** - * Stride between two columns of a block (as number of elements). - * - * Should be a multiple of cache line size for best performance. - */ - static constexpr size_type max_block_stride_ = 32; - /** * Computes the storage scheme suitable for storing blocks of a given * maximum size. 
@@ -511,11 +519,32 @@ class Jacobi : public EnableLinOp>, * * @return a suitable storage scheme */ - static block_interleaved_storage_scheme compute_storage_scheme( - uint32 max_block_size) noexcept + block_interleaved_storage_scheme compute_storage_scheme( + uint32 max_block_size, uint32 param_max_block_stride) { + uint32 default_block_stride = 32; + // If the executor is hip, the warp size is 32 or 64 + if (auto hip_exec = std::dynamic_pointer_cast( + this->get_executor())) { + default_block_stride = hip_exec->get_warp_size(); + } + uint32 max_block_stride = default_block_stride; + if (param_max_block_stride != 0) { + // if parameter max_block_stride is not zero, set max_block_stride = + // param_max_block_stride + max_block_stride = param_max_block_stride; + if (this->get_executor() != this->get_executor()->get_master() && + max_block_stride != default_block_stride) { + // only support the default value on the gpu devive + GKO_NOT_SUPPORTED(this); + } + } + if (parameters_.max_block_size > max_block_stride || + parameters_.max_block_size < 1) { + GKO_NOT_SUPPORTED(this); + } const auto group_size = static_cast( - max_block_stride_ / get_superior_power(uint32{2}, max_block_size)); + max_block_stride / get_superior_power(uint32{2}, max_block_size)); const auto block_offset = max_block_size; const auto block_stride = group_size * block_offset; const auto group_offset = max_block_size * block_stride; diff --git a/include/ginkgo/core/solver/bicg.hpp b/include/ginkgo/core/solver/bicg.hpp new file mode 100644 index 00000000000..14e3cada1ce --- /dev/null +++ b/include/ginkgo/core/solver/bicg.hpp @@ -0,0 +1,190 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_SOLVER_BICG_HPP_ +#define GKO_CORE_SOLVER_BICG_HPP_ + + +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace gko { +namespace solver { + + +/** + * BICG or the Biconjugate gradient method is a Krylov subspace solver. + * + * Being a generic solver, it is capable of solving general matrices, including + * non-s.p.d matrices. Though, the memory and the computational requirement of + * the BiCG solver are higher than of its s.p.d solver counterpart, it has + * the capability to solve generic systems. 
BiCG is the unstable version of + * BiCGSTAB. + * + * @tparam ValueType precision of matrix elements + * + * @ingroup solvers + * @ingroup LinOp + */ +template +class Bicg : public EnableLinOp>, + public Preconditionable, + public Transposable { + friend class EnableLinOp; + friend class EnablePolymorphicObject; + +public: + using value_type = ValueType; + using transposed_type = Bicg; + + /** + * Gets the system operator (matrix) of the linear system. + * + * @return the system operator (matrix) + */ + std::shared_ptr get_system_matrix() const + { + return system_matrix_; + } + + std::unique_ptr transpose() const override; + + std::unique_ptr conj_transpose() const override; + + /** + * Return true as iterative solvers use the data in x as an initial guess. + * + * @return true as iterative solvers use the data in x as an initial guess. + */ + bool apply_uses_initial_guess() const override { return true; } + + /** + * Gets the stopping criterion factory of the solver. + * + * @return the stopping criterion factory + */ + std::shared_ptr get_stop_criterion_factory() + const + { + return stop_criterion_factory_; + } + + /** + * Sets the stopping criterion of the solver. + * + * @param other the new stopping criterion factory + */ + void set_stop_criterion_factory( + std::shared_ptr other) + { + stop_criterion_factory_ = std::move(other); + } + + GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) + { + /** + * Criterion factories. + */ + std::vector> + GKO_FACTORY_PARAMETER(criteria, nullptr); + + /** + * Preconditioner factory. + */ + std::shared_ptr GKO_FACTORY_PARAMETER( + preconditioner, nullptr); + + /** + * Already generated preconditioner. If one is provided, the factory + * `preconditioner` will be ignored. 
+ */ + std::shared_ptr GKO_FACTORY_PARAMETER( + generated_preconditioner, nullptr); + }; + GKO_ENABLE_LIN_OP_FACTORY(Bicg, parameters, Factory); + GKO_ENABLE_BUILD_METHOD(Factory); + +protected: + void apply_impl(const LinOp *b, LinOp *x) const override; + + void apply_impl(const LinOp *alpha, const LinOp *b, const LinOp *beta, + LinOp *x) const override; + + explicit Bicg(std::shared_ptr exec) + : EnableLinOp(std::move(exec)) + {} + + explicit Bicg(const Factory *factory, + std::shared_ptr system_matrix) + : EnableLinOp(factory->get_executor(), + gko::transpose(system_matrix->get_size())), + parameters_{factory->get_parameters()}, + system_matrix_{std::move(system_matrix)} + { + if (parameters_.generated_preconditioner) { + GKO_ASSERT_EQUAL_DIMENSIONS(parameters_.generated_preconditioner, + this); + set_preconditioner(parameters_.generated_preconditioner); + } else if (parameters_.preconditioner) { + set_preconditioner( + parameters_.preconditioner->generate(system_matrix_)); + } else { + set_preconditioner(matrix::Identity::create( + this->get_executor(), this->get_size()[0])); + } + stop_criterion_factory_ = + stop::combine(std::move(parameters_.criteria)); + } + +private: + std::shared_ptr system_matrix_{}; + std::shared_ptr stop_criterion_factory_{}; +}; + + +} // namespace solver +} // namespace gko + + +#endif // GKO_CORE_SOLVER_BICG_HPP_ diff --git a/include/ginkgo/core/solver/bicgstab.hpp b/include/ginkgo/core/solver/bicgstab.hpp index 35464a74f09..c7264bb2fbe 100644 --- a/include/ginkgo/core/solver/bicgstab.hpp +++ b/include/ginkgo/core/solver/bicgstab.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -74,12 +74,14 @@ namespace solver { */ template class Bicgstab : public EnableLinOp>, - public Preconditionable { + public Preconditionable, + public Transposable { friend class EnableLinOp; friend class EnablePolymorphicObject; public: using value_type = ValueType; + using transposed_type = Bicgstab; /** * Gets the system operator (matrix) of the linear system. @@ -91,6 +93,39 @@ class Bicgstab : public EnableLinOp>, return system_matrix_; } + std::unique_ptr transpose() const override; + + std::unique_ptr conj_transpose() const override; + + /** + * Return true as iterative solvers use the data in x as an initial guess. + * + * @return true as iterative solvers use the data in x as an initial guess. + */ + bool apply_uses_initial_guess() const override { return true; } + + /** + * Gets the stopping criterion factory of the solver. + * + * @return the stopping criterion factory + */ + std::shared_ptr get_stop_criterion_factory() + const + { + return stop_criterion_factory_; + } + + /** + * Sets the stopping criterion of the solver. 
+ * + * @param other the new stopping criterion factory + */ + void set_stop_criterion_factory( + std::shared_ptr other) + { + stop_criterion_factory_ = std::move(other); + } + GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) { /** @@ -128,7 +163,7 @@ class Bicgstab : public EnableLinOp>, explicit Bicgstab(const Factory *factory, std::shared_ptr system_matrix) : EnableLinOp(factory->get_executor(), - transpose(system_matrix->get_size())), + gko::transpose(system_matrix->get_size())), parameters_{factory->get_parameters()}, system_matrix_{std::move(system_matrix)} { @@ -157,4 +192,4 @@ class Bicgstab : public EnableLinOp>, } // namespace gko -#endif // GKO_CORE_SOLVER_BICGSTAB_HPP +#endif // GKO_CORE_SOLVER_BICGSTAB_HPP_ diff --git a/include/ginkgo/core/solver/cg.hpp b/include/ginkgo/core/solver/cg.hpp index 9d661b3b07d..7e11b4e2d2a 100644 --- a/include/ginkgo/core/solver/cg.hpp +++ b/include/ginkgo/core/solver/cg.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -69,12 +69,15 @@ namespace solver { * @ingroup LinOp */ template -class Cg : public EnableLinOp>, public Preconditionable { +class Cg : public EnableLinOp>, + public Preconditionable, + public Transposable { friend class EnableLinOp; friend class EnablePolymorphicObject; public: using value_type = ValueType; + using transposed_type = Cg; /** * Gets the system operator (matrix) of the linear system. @@ -86,6 +89,39 @@ class Cg : public EnableLinOp>, public Preconditionable { return system_matrix_; } + std::unique_ptr transpose() const override; + + std::unique_ptr conj_transpose() const override; + + /** + * Return true as iterative solvers use the data in x as an initial guess. + * + * @return true as iterative solvers use the data in x as an initial guess. 
+ */ + bool apply_uses_initial_guess() const override { return true; } + + /** + * Gets the stopping criterion factory of the solver. + * + * @return the stopping criterion factory + */ + std::shared_ptr get_stop_criterion_factory() + const + { + return stop_criterion_factory_; + } + + /** + * Sets the stopping criterion of the solver. + * + * @param other the new stopping criterion factory + */ + void set_stop_criterion_factory( + std::shared_ptr other) + { + stop_criterion_factory_ = std::move(other); + } + GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) { /** @@ -123,7 +159,7 @@ class Cg : public EnableLinOp>, public Preconditionable { explicit Cg(const Factory *factory, std::shared_ptr system_matrix) : EnableLinOp(factory->get_executor(), - transpose(system_matrix->get_size())), + gko::transpose(system_matrix->get_size())), parameters_{factory->get_parameters()}, system_matrix_{std::move(system_matrix)} { @@ -152,4 +188,4 @@ class Cg : public EnableLinOp>, public Preconditionable { } // namespace gko -#endif // GKO_CORE_SOLVER_CG_HPP +#endif // GKO_CORE_SOLVER_CG_HPP_ diff --git a/include/ginkgo/core/solver/cgs.hpp b/include/ginkgo/core/solver/cgs.hpp index 65609f43b24..26b9e41e276 100644 --- a/include/ginkgo/core/solver/cgs.hpp +++ b/include/ginkgo/core/solver/cgs.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -66,12 +66,15 @@ namespace solver { * @ingroup LinOp */ template -class Cgs : public EnableLinOp>, public Preconditionable { +class Cgs : public EnableLinOp>, + public Preconditionable, + public Transposable { friend class EnableLinOp; friend class EnablePolymorphicObject; public: using value_type = ValueType; + using transposed_type = Cgs; /** * Gets the system operator (matrix) of the linear system. 
@@ -83,6 +86,39 @@ class Cgs : public EnableLinOp>, public Preconditionable { return system_matrix_; } + std::unique_ptr transpose() const override; + + std::unique_ptr conj_transpose() const override; + + /** + * Return true as iterative solvers use the data in x as an initial guess. + * + * @return true as iterative solvers use the data in x as an initial guess. + */ + bool apply_uses_initial_guess() const override { return true; } + + /** + * Gets the stopping criterion factory of the solver. + * + * @return the stopping criterion factory + */ + std::shared_ptr get_stop_criterion_factory() + const + { + return stop_criterion_factory_; + } + + /** + * Sets the stopping criterion of the solver. + * + * @param other the new stopping criterion factory + */ + void set_stop_criterion_factory( + std::shared_ptr other) + { + stop_criterion_factory_ = std::move(other); + } + GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) { /** @@ -120,7 +156,7 @@ class Cgs : public EnableLinOp>, public Preconditionable { explicit Cgs(const Factory *factory, std::shared_ptr system_matrix) : EnableLinOp(factory->get_executor(), - transpose(system_matrix->get_size())), + gko::transpose(system_matrix->get_size())), parameters_{factory->get_parameters()}, system_matrix_{std::move(system_matrix)} { @@ -149,4 +185,4 @@ class Cgs : public EnableLinOp>, public Preconditionable { } // namespace gko -#endif // GKO_CORE_SOLVER_CGS_HPP +#endif // GKO_CORE_SOLVER_CGS_HPP_ diff --git a/include/ginkgo/core/solver/fcg.hpp b/include/ginkgo/core/solver/fcg.hpp index eaec9587685..ba9d14328ee 100644 --- a/include/ginkgo/core/solver/fcg.hpp +++ b/include/ginkgo/core/solver/fcg.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -74,12 +74,15 @@ namespace solver { * @ingroup LinOp */ template -class Fcg : public EnableLinOp>, public Preconditionable { +class Fcg : public EnableLinOp>, + public Preconditionable, + public Transposable { friend class EnableLinOp; friend class EnablePolymorphicObject; public: using value_type = ValueType; + using transposed_type = Fcg; /** * Gets the system operator (matrix) of the linear system. @@ -91,6 +94,39 @@ class Fcg : public EnableLinOp>, public Preconditionable { return system_matrix_; } + std::unique_ptr transpose() const override; + + std::unique_ptr conj_transpose() const override; + + /** + * Return true as iterative solvers use the data in x as an initial guess. + * + * @return true as iterative solvers use the data in x as an initial guess. + */ + bool apply_uses_initial_guess() const override { return true; } + + /** + * Gets the stopping criterion factory of the solver. + * + * @return the stopping criterion factory + */ + std::shared_ptr get_stop_criterion_factory() + const + { + return stop_criterion_factory_; + } + + /** + * Sets the stopping criterion of the solver. 
+ * + * @param other the new stopping criterion factory + */ + void set_stop_criterion_factory( + std::shared_ptr other) + { + stop_criterion_factory_ = std::move(other); + } + GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) { /** @@ -128,7 +164,7 @@ class Fcg : public EnableLinOp>, public Preconditionable { explicit Fcg(const Factory *factory, std::shared_ptr system_matrix) : EnableLinOp(factory->get_executor(), - transpose(system_matrix->get_size())), + gko::transpose(system_matrix->get_size())), parameters_{factory->get_parameters()}, system_matrix_{std::move(system_matrix)} { @@ -157,4 +193,4 @@ class Fcg : public EnableLinOp>, public Preconditionable { } // namespace gko -#endif // GKO_CORE_SOLVER_FCG_HPP +#endif // GKO_CORE_SOLVER_FCG_HPP_ diff --git a/include/ginkgo/core/solver/gmres.hpp b/include/ginkgo/core/solver/gmres.hpp index 06ff1abd629..83396641d41 100644 --- a/include/ginkgo/core/solver/gmres.hpp +++ b/include/ginkgo/core/solver/gmres.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -69,12 +69,15 @@ constexpr size_type default_krylov_dim = 100u; * @ingroup LinOp */ template -class Gmres : public EnableLinOp>, public Preconditionable { +class Gmres : public EnableLinOp>, + public Preconditionable, + public Transposable { friend class EnableLinOp; friend class EnablePolymorphicObject; public: using value_type = ValueType; + using transposed_type = Gmres; /** * Gets the system operator (matrix) of the linear system. @@ -86,13 +89,53 @@ class Gmres : public EnableLinOp>, public Preconditionable { return system_matrix_; } + std::unique_ptr transpose() const override; + + std::unique_ptr conj_transpose() const override; + + /** + * Return true as iterative solvers use the data in x as an initial guess. 
+ * + * @return true as iterative solvers use the data in x as an initial guess. + */ + bool apply_uses_initial_guess() const override { return true; } + /** - * Returns the krylov dimension. + * Gets the krylov dimension of the solver * * @return the krylov dimension */ size_type get_krylov_dim() const { return krylov_dim_; } + /** + * Sets the krylov dimension + * + * @param other the new krylov dimension + */ + void set_krylov_dim(const size_type &other) { krylov_dim_ = other; } + + /** + * Gets the stopping criterion factory of the solver. + * + * @return the stopping criterion factory + */ + std::shared_ptr get_stop_criterion_factory() + const + { + return stop_criterion_factory_; + } + + /** + * Sets the stopping criterion of the solver. + * + * @param other the new stopping criterion factory + */ + void set_stop_criterion_factory( + std::shared_ptr other) + { + stop_criterion_factory_ = std::move(other); + } + GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) { /** @@ -135,7 +178,7 @@ class Gmres : public EnableLinOp>, public Preconditionable { explicit Gmres(const Factory *factory, std::shared_ptr system_matrix) : EnableLinOp(factory->get_executor(), - transpose(system_matrix->get_size())), + gko::transpose(system_matrix->get_size())), parameters_{factory->get_parameters()}, system_matrix_{std::move(system_matrix)} { @@ -170,4 +213,4 @@ class Gmres : public EnableLinOp>, public Preconditionable { } // namespace gko -#endif // GKO_CORE_SOLVER_GMRES_HPP +#endif // GKO_CORE_SOLVER_GMRES_HPP_ diff --git a/include/ginkgo/core/solver/ir.hpp b/include/ginkgo/core/solver/ir.hpp index 72173c6e49d..c665356ec5c 100644 --- a/include/ginkgo/core/solver/ir.hpp +++ b/include/ginkgo/core/solver/ir.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -52,27 +53,34 @@ namespace solver { /** * Iterative refinement (IR) is an iterative method that uses another coarse * method to approximate the error of the current solution via the current - * residual. + * residual. Moreover, it can be also considered as preconditioned Richardson + * iteration with relaxation factor = 1. * * For any approximation of the solution `solution` to the system `Ax = b`, the * residual is defined as: `residual = b - A solution`. The error in * `solution`, `e = x - solution` (with `x` being the exact solution) can be * obtained as the solution to the residual equation `Ae = residual`, since `A e * = Ax - A solution = b - A solution = residual`. Then, the real solution is - * computed as `x = solution + e`. Instead of accurately solving the residual - * equation `Ae = residual`, the solution of the system `e` can be approximated - * to obtain the approximation `error` using a coarse method `solver`, which is - * used to update `solution`, and the entire process is repeated with the - * updated `solution`. This yields the iterative refinement method: + * computed as `x = relaxation_factor * solution + e`. Instead of accurately + * solving the residual equation `Ae = residual`, the solution of the system `e` + * can be approximated to obtain the approximation `error` using a coarse method + * `solver`, which is used to update `solution`, and the entire process is + * repeated with the updated `solution`. 
This yields the iterative refinement + * method: * * ``` * solution = initial_guess * while not converged: * residual = b - A solution * error = solver(A, residual) - * solution = solution + error + * solution = solution + relaxation_factor * error * ``` * + * With `relaxation_factor` equal to 1 (default), the solver is Iterative + * Refinement, with `relaxation_factor` equal to a value other than `1`, the + * solver is a Richardson iteration, with possibility for additional + * preconditioning. + * * Assuming that `solver` has accuracy `c`, i.e., `| e - error | <= c | e |`, * iterative refinement will converge with a convergence rate of `c`. Indeed, * from `e - error = x - solution - error = x - solution*` (where `solution*` @@ -86,7 +94,8 @@ namespace solver { * solver. Such a setting results in a relaxation method known as the Richardson * iteration with parameter 1, which is guaranteed to converge for matrices * whose spectrum is strictly contained within the unit disc around 1 (i.e., all - * its eigenvalues `lambda` have to satisfy the equation `|lambda - 1| < 1). + * its eigenvalues `lambda` have to satisfy the equation `|relaxation_factor * + * lambda - 1| < 1). * * @tparam ValueType precision of matrix elements * @@ -94,12 +103,13 @@ namespace solver { * @ingroup LinOp */ template -class Ir : public EnableLinOp> { +class Ir : public EnableLinOp>, public Transposable { friend class EnableLinOp; friend class EnablePolymorphicObject; public: using value_type = ValueType; + using transposed_type = Ir; /** * Returns the system operator (matrix) of the linear system. @@ -111,6 +121,16 @@ class Ir : public EnableLinOp> { return system_matrix_; } + std::unique_ptr transpose() const override; + + std::unique_ptr conj_transpose() const override; + + /** + * Return true as iterative solvers use the data in x as an initial guess. + * + * @return true as iterative solvers use the data in x as an initial guess. 
+ */ + bool apply_uses_initial_guess() const override { return true; } /** * Returns the solver operator used as the inner solver. @@ -130,6 +150,28 @@ class Ir : public EnableLinOp> { solver_ = new_solver; } + /** + * Gets the stopping criterion factory of the solver. + * + * @return the stopping criterion factory + */ + std::shared_ptr get_stop_criterion_factory() + const + { + return stop_criterion_factory_; + } + + /** + * Sets the stopping criterion of the solver. + * + * @param other the new stopping criterion factory + */ + void set_stop_criterion_factory( + std::shared_ptr other) + { + stop_criterion_factory_ = std::move(other); + } + GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) { /** @@ -150,6 +192,11 @@ class Ir : public EnableLinOp> { */ std::shared_ptr GKO_FACTORY_PARAMETER(generated_solver, nullptr); + + /** + * Relaxation factor for Richardson iteration + */ + ValueType GKO_FACTORY_PARAMETER(relaxation_factor, value_type{1}); }; GKO_ENABLE_LIN_OP_FACTORY(Ir, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); @@ -167,7 +214,7 @@ class Ir : public EnableLinOp> { explicit Ir(const Factory *factory, std::shared_ptr system_matrix) : EnableLinOp(factory->get_executor(), - transpose(system_matrix->get_size())), + gko::transpose(system_matrix->get_size())), parameters_{factory->get_parameters()}, system_matrix_{std::move(system_matrix)} { @@ -180,6 +227,8 @@ class Ir : public EnableLinOp> { solver_ = matrix::Identity::create(this->get_executor(), this->get_size()[0]); } + relaxation_factor_ = gko::initialize>( + {parameters_.relaxation_factor}, this->get_executor()); stop_criterion_factory_ = stop::combine(std::move(parameters_.criteria)); } @@ -188,9 +237,14 @@ class Ir : public EnableLinOp> { std::shared_ptr system_matrix_{}; std::shared_ptr solver_{}; std::shared_ptr stop_criterion_factory_{}; + std::shared_ptr> relaxation_factor_{}; }; +template +using Richardson = Ir; + + } // namespace solver } // namespace gko diff --git 
a/include/ginkgo/core/solver/lower_trs.hpp b/include/ginkgo/core/solver/lower_trs.hpp index 481063a4f51..409a2cd1583 100644 --- a/include/ginkgo/core/solver/lower_trs.hpp +++ b/include/ginkgo/core/solver/lower_trs.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -58,6 +58,10 @@ namespace solver { struct SolveStruct; +template +class UpperTrs; + + /** * LowerTrs is the triangular solver which solves the system L x = b, when L is * a lower triangular matrix. It works best when passing in a matrix in CSR @@ -76,13 +80,16 @@ struct SolveStruct; * @ingroup LinOp */ template -class LowerTrs : public EnableLinOp> { +class LowerTrs : public EnableLinOp>, + public Transposable { friend class EnableLinOp; friend class EnablePolymorphicObject; + friend class UpperTrs; public: using value_type = ValueType; using index_type = IndexType; + using transposed_type = UpperTrs; /** * Gets the system operator (CSR matrix) of the linear system. 
@@ -95,6 +102,10 @@ class LowerTrs : public EnableLinOp> { return system_matrix_; } + std::unique_ptr transpose() const override; + + std::unique_ptr conj_transpose() const override; + GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) { /** @@ -138,7 +149,7 @@ class LowerTrs : public EnableLinOp> { explicit LowerTrs(const Factory *factory, std::shared_ptr system_matrix) : EnableLinOp(factory->get_executor(), - transpose(system_matrix->get_size())), + gko::transpose(system_matrix->get_size())), parameters_{factory->get_parameters()}, system_matrix_{} { @@ -168,4 +179,4 @@ class LowerTrs : public EnableLinOp> { } // namespace gko -#endif // GKO_CORE_SOLVER_LOWER_TRS_HPP +#endif // GKO_CORE_SOLVER_LOWER_TRS_HPP_ diff --git a/include/ginkgo/core/solver/upper_trs.hpp b/include/ginkgo/core/solver/upper_trs.hpp index 6f23cd4d1fd..512467919a4 100644 --- a/include/ginkgo/core/solver/upper_trs.hpp +++ b/include/ginkgo/core/solver/upper_trs.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -58,6 +58,10 @@ namespace solver { struct SolveStruct; +template +class LowerTrs; + + /** * UpperTrs is the triangular solver which solves the system U x = b, when U is * an upper triangular matrix. It works best when passing in a matrix in CSR @@ -76,13 +80,16 @@ struct SolveStruct; * @ingroup LinOp */ template -class UpperTrs : public EnableLinOp> { +class UpperTrs : public EnableLinOp>, + public Transposable { friend class EnableLinOp; friend class EnablePolymorphicObject; + friend class LowerTrs; public: using value_type = ValueType; using index_type = IndexType; + using transposed_type = LowerTrs; /** * Gets the system operator (CSR matrix) of the linear system. 
@@ -95,6 +102,10 @@ class UpperTrs : public EnableLinOp> { return system_matrix_; } + std::unique_ptr transpose() const override; + + std::unique_ptr conj_transpose() const override; + GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) { /** @@ -132,7 +143,7 @@ class UpperTrs : public EnableLinOp> { explicit UpperTrs(const Factory *factory, std::shared_ptr system_matrix) : EnableLinOp(factory->get_executor(), - transpose(system_matrix->get_size())), + gko::transpose(system_matrix->get_size())), parameters_{factory->get_parameters()}, system_matrix_{} { @@ -162,4 +173,4 @@ class UpperTrs : public EnableLinOp> { } // namespace gko -#endif // GKO_CORE_SOLVER_UPPER_TRS_HPP +#endif // GKO_CORE_SOLVER_UPPER_TRS_HPP_ diff --git a/include/ginkgo/core/stop/combined.hpp b/include/ginkgo/core/stop/combined.hpp index cc0e88a36be..d5d88a978f6 100644 --- a/include/ginkgo/core/stop/combined.hpp +++ b/include/ginkgo/core/stop/combined.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,10 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_CORE_STOP_COMBINED_HPP_ -#include +#include -#include +#include namespace gko { @@ -87,7 +87,14 @@ class Combined : public EnablePolymorphicObject { parameters_{factory->get_parameters()} { for (const auto &f : parameters_.criteria) { - criteria_.push_back(f->generate(args)); + // Ignore the nullptr from the list + if (f != nullptr) { + criteria_.push_back(f->generate(args)); + } + } + // If the list is empty or contains only nullptr, throw gko::NotSupported + if (criteria_.size() == 0) { + GKO_NOT_SUPPORTED(this); } } @@ -120,12 +127,21 @@ std::shared_ptr combine(FactoryContainer &&factories) GKO_NOT_SUPPORTED(nullptr); return nullptr; case 1: + if (factories[0] == nullptr) { + GKO_NOT_SUPPORTED(nullptr); + } return factories[0]; default: - auto exec = factories[0]->get_executor(); - return Combined::build() - .with_criteria(std::forward(factories)) - .on(exec); + if (factories[0] == nullptr) { + // first factory must be valid to capture executor + GKO_NOT_SUPPORTED(nullptr); + return nullptr; + } else { + auto exec = factories[0]->get_executor(); + return Combined::build() + .with_criteria(std::forward(factories)) + .on(exec); + } } } diff --git a/include/ginkgo/core/stop/criterion.hpp b/include/ginkgo/core/stop/criterion.hpp index f619137387e..35c28aefdd9 100644 --- a/include/ginkgo/core/stop/criterion.hpp +++ b/include/ginkgo/core/stop/criterion.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include #include #include #include @@ -275,6 +276,12 @@ using EnableDefaultCriterionFactory = * `get_<_parameters_name>()`) * @param _factory_name name of the generated factory type * + * @internal For some abstract reason, nvcc compilation through HIP does not + * properly take into account the `using` declaration to inherit + * constructors. In addition, the default initialization `{}` for + * `_parameters_name##type parameters` also does not work, which + * means the current form is probably the only correct one. + * * @ingroup stop */ #define GKO_ENABLE_CRITERION_FACTORY(_criterion, _parameters_name, \ diff --git a/include/ginkgo/core/stop/iteration.hpp b/include/ginkgo/core/stop/iteration.hpp index 8c17274c9de..e2fa08a60e3 100644 --- a/include/ginkgo/core/stop/iteration.hpp +++ b/include/ginkgo/core/stop/iteration.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/include/ginkgo/core/stop/residual_norm.hpp b/include/ginkgo/core/stop/residual_norm.hpp new file mode 100644 index 00000000000..03052618cc5 --- /dev/null +++ b/include/ginkgo/core/stop/residual_norm.hpp @@ -0,0 +1,278 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_STOP_RESIDUAL_NORM_HPP_ +#define GKO_CORE_STOP_RESIDUAL_NORM_HPP_ + + +#include + + +#include +#include +#include +#include +#include + + +namespace gko { +namespace stop { + + +/** + * The ResidualNorm class provides a framework for stopping criteria + * related to the residual norm. These criteria differ in the way they + * initialize starting_tau_, so in the value they compare the + * residual norm against. 
+ * + * @ingroup stop + */ +template +class ResidualNorm + : public EnablePolymorphicObject, Criterion> { + friend class EnablePolymorphicObject, Criterion>; + +public: + using NormVector = matrix::Dense>; + using Vector = matrix::Dense; + +protected: + bool check_impl(uint8 stoppingId, bool setFinalized, + Array *stop_status, bool *one_changed, + const Criterion::Updater &) override; + + explicit ResidualNorm(std::shared_ptr exec) + : EnablePolymorphicObject(exec), + device_storage_{exec, 2} + {} + + explicit ResidualNorm(std::shared_ptr exec, + remove_complex tolerance) + : EnablePolymorphicObject(exec), + device_storage_{exec, 2}, + tolerance_{tolerance} + {} + + std::unique_ptr starting_tau_{}; + std::unique_ptr u_dense_tau_{}; + +private: + remove_complex tolerance_{}; + /* Contains device side: all_converged and one_changed booleans */ + Array device_storage_; +}; + + +/** + * The ResidualNormReduction class is a stopping criterion which stops the + * iteration process when the residual norm is below a certain + * threshold relative to the norm of the initial residual, i.e. when + * norm(residual) / norm(initial_residual) < threshold. + * For better performance, the checks are run thanks to kernels on + * the executor where the algorithm is executed. + * + * @note To use this stopping criterion there are some dependencies. The + * constructor depends on `initial_residual` in order to compute the first + * relative residual norm. The check method depends on either the + * `residual_norm` or the `residual` being set. When any of those is not + * correctly provided, an exception ::gko::NotSupported() is thrown. 
+ * + * @ingroup stop + */ +template +class ResidualNormReduction : public ResidualNorm { +public: + using NormVector = matrix::Dense>; + using Vector = matrix::Dense; + + GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) + { + /** + * Factor by which the residual norm will be reduced + */ + remove_complex GKO_FACTORY_PARAMETER(reduction_factor, + 1e-15); + }; + GKO_ENABLE_CRITERION_FACTORY(ResidualNormReduction, parameters, + Factory); + GKO_ENABLE_BUILD_METHOD(Factory); + +protected: + explicit ResidualNormReduction(std::shared_ptr exec) + : ResidualNorm(exec) + {} + + explicit ResidualNormReduction(const Factory *factory, + const CriterionArgs &args) + : ResidualNorm(factory->get_executor(), + factory->get_parameters().reduction_factor), + parameters_{factory->get_parameters()} + { + if (args.initial_residual == nullptr) { + GKO_NOT_SUPPORTED(nullptr); + } + + auto exec = factory->get_executor(); + + auto dense_r = as(args.initial_residual); + this->starting_tau_ = NormVector::create( + exec, dim<2>{1, args.initial_residual->get_size()[1]}); + this->u_dense_tau_ = + NormVector::create_with_config_of(this->starting_tau_.get()); + dense_r->compute_norm2(this->starting_tau_.get()); + } +}; + + +/** + * The RelativeResidualNorm class is a stopping criterion which stops the + * iteration process when the residual norm is below a certain + * threshold relative to the norm of the right-hand side, i.e. when + * norm(residual) / norm(right_hand_side) < threshold. + * For better performance, the checks are run thanks to kernels on + * the executor where the algorithm is executed. + * + * @note To use this stopping criterion there are some dependencies. The + * constructor depends on `b` in order to compute the norm of the + * right-hand side. If this is not correctly provided, an exception + * ::gko::NotSupported() is thrown. 
+ * + * @ingroup stop + */ +template +class RelativeResidualNorm : public ResidualNorm { +public: + using NormVector = matrix::Dense>; + using Vector = matrix::Dense; + + GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) + { + /** + * Relative residual norm goal + */ + remove_complex GKO_FACTORY_PARAMETER(tolerance, 1e-15); + }; + GKO_ENABLE_CRITERION_FACTORY(RelativeResidualNorm, parameters, + Factory); + GKO_ENABLE_BUILD_METHOD(Factory); + +protected: + explicit RelativeResidualNorm(std::shared_ptr exec) + : ResidualNorm(exec) + {} + + explicit RelativeResidualNorm(const Factory *factory, + const CriterionArgs &args) + : ResidualNorm(factory->get_executor(), + factory->get_parameters().tolerance), + parameters_{factory->get_parameters()} + { + if (args.b == nullptr) { + GKO_NOT_SUPPORTED(nullptr); + } + + auto exec = factory->get_executor(); + + auto dense_rhs = as(args.b); + this->starting_tau_ = + NormVector::create(exec, dim<2>{1, args.b->get_size()[1]}); + this->u_dense_tau_ = + NormVector::create_with_config_of(this->starting_tau_.get()); + dense_rhs->compute_norm2(this->starting_tau_.get()); + } +}; + + +/** + * The AbsoluteResidualNorm class is a stopping criterion which stops the + * iteration process when the residual norm is below a certain + * threshold, i.e. when norm(residual) < threshold. + * For better performance, the checks are run thanks to kernels on + * the executor where the algorithm is executed. + * + * @note To use this stopping criterion there are some dependencies. The + * constructor depends on `b` in order to get the number of right-hand sides. + * If this is not correctly provided, an exception ::gko::NotSupported() + * is thrown. 
+ * + * @ingroup stop + */ +template +class AbsoluteResidualNorm : public ResidualNorm { +public: + using NormVector = matrix::Dense>; + using Vector = matrix::Dense; + + GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) + { + /** + * Absolute residual norm goal + */ + remove_complex GKO_FACTORY_PARAMETER(tolerance, 1e-15); + }; + GKO_ENABLE_CRITERION_FACTORY(AbsoluteResidualNorm, parameters, + Factory); + GKO_ENABLE_BUILD_METHOD(Factory); + +protected: + void initialize_starting_tau(); + + explicit AbsoluteResidualNorm(std::shared_ptr exec) + : ResidualNorm(exec) + {} + + explicit AbsoluteResidualNorm(const Factory *factory, + const CriterionArgs &args) + : ResidualNorm(factory->get_executor(), + factory->get_parameters().tolerance), + parameters_{factory->get_parameters()} + { + if (args.b == nullptr) { + GKO_NOT_SUPPORTED(nullptr); + } + + auto exec = factory->get_executor(); + + this->starting_tau_ = + NormVector::create(exec, dim<2>{1, args.b->get_size()[1]}); + this->u_dense_tau_ = + NormVector::create_with_config_of(this->starting_tau_.get()); + initialize_starting_tau(); + } +}; + + +} // namespace stop +} // namespace gko + + +#endif // GKO_CORE_STOP_RESIDUAL_NORM_HPP_ diff --git a/include/ginkgo/core/stop/residual_norm_reduction.hpp b/include/ginkgo/core/stop/residual_norm_reduction.hpp index 4ae3392021b..6872b7be5c2 100644 --- a/include/ginkgo/core/stop/residual_norm_reduction.hpp +++ b/include/ginkgo/core/stop/residual_norm_reduction.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,94 +34,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_CORE_STOP_RESIDUAL_NORM_REDUCTION_HPP_ -#include -#include -#include -#include - - -#include - - -namespace gko { -namespace stop { - -/** - * The ResidualNormReduction class is a stopping criterion which stops the - * iteration process when the relative residual norm is below a certain - * threshold. For better performance, the checks are run thanks to kernels on - * the executor where the algorithm is executed. - * - * @note To use this stopping criterion there are some dependencies. The - * constructor depends on `initial_residual` in order to compute the first - * relative residual norm. The check method depends on either the - * `residual_norm` or the `residual` being set. When any of those is not - * correctly provided, an exception ::gko::NotSupported() is thrown. - * - * @ingroup stop - */ -template -class ResidualNormReduction - : public EnablePolymorphicObject, - Criterion> { - friend class EnablePolymorphicObject, - Criterion>; - -public: - using Vector = matrix::Dense; - - GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) - { - /** - * Relative residual norm goal - */ - remove_complex GKO_FACTORY_PARAMETER(reduction_factor, - 1e-15); - }; - GKO_ENABLE_CRITERION_FACTORY(ResidualNormReduction, parameters, - Factory); - GKO_ENABLE_BUILD_METHOD(Factory); - -protected: - bool check_impl(uint8 stoppingId, bool setFinalized, - Array *stop_status, bool *one_changed, - const Criterion::Updater &) override; - - explicit ResidualNormReduction(std::shared_ptr exec) - : EnablePolymorphicObject(exec), - device_storage_{exec, 2} - {} - - explicit ResidualNormReduction(const Factory *factory, - const CriterionArgs &args) - : EnablePolymorphicObject( - factory->get_executor()), - parameters_{factory->get_parameters()}, - device_storage_{factory->get_executor(), 2} - { - if (args.initial_residual == nullptr) { - GKO_NOT_SUPPORTED(nullptr); - } - - auto exec = factory->get_executor(); - - auto dense_r = as(args.initial_residual); - starting_tau_ = 
Vector::create( - exec, dim<2>{1, args.initial_residual->get_size()[1]}); - u_dense_tau_ = Vector::create_with_config_of(starting_tau_.get()); - dense_r->compute_norm2(starting_tau_.get()); - } - -private: - std::unique_ptr starting_tau_{}; - std::unique_ptr u_dense_tau_{}; - /* Contains device side: all_converged and one_changed booleans */ - Array device_storage_; -}; - - -} // namespace stop -} // namespace gko +#ifdef __GNUC__ +#pragma message \ + "This file is deprecated and will be removed in a later major release." +#elif defined(_MSC_VER) +#pragma message WARN( \ + "This file is deprecated and will be removed in a later major release.") +#endif +#include #endif // GKO_CORE_STOP_RESIDUAL_NORM_REDUCTION_HPP_ diff --git a/include/ginkgo/core/stop/stopping_status.hpp b/include/ginkgo/core/stop/stopping_status.hpp index 6a74d0f7d8f..679d78be4de 100644 --- a/include/ginkgo/core/stop/stopping_status.hpp +++ b/include/ginkgo/core/stop/stopping_status.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/include/ginkgo/core/stop/time.hpp b/include/ginkgo/core/stop/time.hpp index a63bf576112..ef8f52fe7d8 100644 --- a/include/ginkgo/core/stop/time.hpp +++ b/include/ginkgo/core/stop/time.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,10 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_CORE_STOP_TIME_HPP_ -#include +#include -#include +#include namespace gko { @@ -58,10 +58,10 @@ class Time : public EnablePolymorphicObject { GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) { /** - * Amount of seconds to wait + * Amount of seconds to wait (default value: 10 seconds) */ - std::chrono::nanoseconds GKO_FACTORY_PARAMETER( - time_limit, std::chrono::seconds(10)); + std::chrono::nanoseconds GKO_FACTORY_PARAMETER(time_limit, + 10000000000LL); }; GKO_ENABLE_CRITERION_FACTORY(Time, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); @@ -72,7 +72,9 @@ class Time : public EnablePolymorphicObject { const Updater &) override; explicit Time(std::shared_ptr exec) - : EnablePolymorphicObject(std::move(exec)) + : EnablePolymorphicObject(std::move(exec)), + time_limit_{}, + start_{} {} explicit Time(const Factory *factory, const CriterionArgs args) @@ -89,8 +91,8 @@ class Time : public EnablePolymorphicObject { * parameters and here properly convert the double to a * std::chrono::duration type */ - std::chrono::duration time_limit_{}; - clock::time_point start_{}; + std::chrono::duration time_limit_; + clock::time_point start_; }; diff --git a/include/ginkgo/core/synthesizer/containers.hpp b/include/ginkgo/core/synthesizer/containers.hpp index 075ddc92806..7fccad5a29d 100644 --- a/include/ginkgo/core/synthesizer/containers.hpp +++ b/include/ginkgo/core/synthesizer/containers.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,8 +30,11 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ -#ifndef GKO_CORE_SYNTHESIZER_CONTAINERS_ -#define GKO_CORE_SYNTHESIZER_CONTAINERS_ +#ifndef GKO_CORE_SYNTHESIZER_CONTAINERS_HPP_ +#define GKO_CORE_SYNTHESIZER_CONTAINERS_HPP_ + + +#include namespace gko { @@ -115,4 +118,4 @@ using as_list = typename detail::as_list_impl::type; } // namespace gko -#endif // GKO_CORE_SYNTHESIZER_CONTAINERS_ +#endif // GKO_CORE_SYNTHESIZER_CONTAINERS_HPP_ diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index 95a7dc3c734..1866412706d 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -59,7 +59,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include +#include #include +#include #include #include @@ -73,12 +76,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include +#include #include +#include #include #include #include @@ -91,7 +97,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include +#include #include #include diff --git a/include/ginkgo/ginkgo.hpp.in b/include/ginkgo/ginkgo.hpp.in index 6dbaafef794..d3e83f82ccd 100644 --- a/include/ginkgo/ginkgo.hpp.in +++ b/include/ginkgo/ginkgo.hpp.in @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/matrices/CMakeLists.txt b/matrices/CMakeLists.txt index 6c368edfa95..ffe3602f83a 100644 --- a/matrices/CMakeLists.txt +++ b/matrices/CMakeLists.txt @@ -3,3 +3,13 @@ configure_file("${Ginkgo_SOURCE_DIR}/matrices/config.hpp.in" configure_file("test/ani1.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/ani1.mtx") configure_file("test/ani4.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/ani4.mtx") +configure_file("test/isai_l.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_l.mtx") +configure_file("test/isai_l_excess.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_l_excess.mtx") +configure_file("test/isai_l_excess_rhs.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_l_excess_rhs.mtx") +configure_file("test/isai_l_inv.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_l_inv.mtx") +configure_file("test/isai_l_inv_partial.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_l_inv_partial.mtx") +configure_file("test/isai_u.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_u.mtx") +configure_file("test/isai_u_excess.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_u_excess.mtx") +configure_file("test/isai_u_excess_rhs.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_u_excess_rhs.mtx") +configure_file("test/isai_u_inv.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_u_inv.mtx") +configure_file("test/isai_u_inv_partial.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_u_inv_partial.mtx") diff --git a/matrices/config.hpp.in b/matrices/config.hpp.in index 3920a6d1788..93cd1adb874 100644 --- a/matrices/config.hpp.in +++ b/matrices/config.hpp.in @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -38,12 +38,13 @@ namespace gko { namespace matrices { -const char *location_ani1_mtx = "@Ginkgo_SOURCE_DIR@/matrices/test/ani1.mtx"; -const char *location_ani4_mtx = "@Ginkgo_SOURCE_DIR@/matrices/test/ani4.mtx"; +const char *location_ani1_mtx = "@Ginkgo_BINARY_DIR@/matrices/test/ani1.mtx"; +const char *location_ani4_mtx = "@Ginkgo_BINARY_DIR@/matrices/test/ani4.mtx"; +const char *location_isai_mtxs = "@Ginkgo_BINARY_DIR@/matrices/test/"; -} // namespace matrices -} // namespace gko +} // namespace matrices +} // namespace gko -#endif // GKO_MATRICES_CONFIG_HPP_ +#endif // GKO_MATRICES_CONFIG_HPP_ diff --git a/matrices/test/isai_l.mtx b/matrices/test/isai_l.mtx new file mode 100644 index 00000000000..01e7dd3eeab --- /dev/null +++ b/matrices/test/isai_l.mtx @@ -0,0 +1,162 @@ +%%MatrixMarket matrix coordinate real general +35 35 160 +1 1 1 +2 1 -1 +2 2 1 +3 2 -1 +3 3 2 +4 3 2 +4 4 1 +5 3 2 +5 4 -1 +5 5 1 +6 3 2 +6 5 -1 +6 6 1 +7 3 2 +7 6 -1 +7 7 1 +8 3 2 +8 7 -1 +8 8 1 +9 3 2 +9 8 -1 +9 9 1 +10 3 2 +10 9 -1 +10 10 1 +11 3 2 +11 10 -1 +11 11 1 +12 3 2 +12 11 -1 +12 12 1 +13 3 2 +13 12 -1 +13 13 1 +14 3 2 +14 13 -1 +14 14 1 +15 3 2 +15 14 -1 +15 15 1 +16 3 2 +16 15 -1 +16 16 1 +17 3 2 +17 16 -1 +17 17 1 +18 3 2 +18 17 -1 +18 18 1 +19 3 2 +19 18 -1 +19 19 1 +20 3 2 +20 19 -1 +20 20 1 +21 3 2 +21 20 -1 +21 21 1 +22 3 2 +22 21 -1 +22 22 1 +23 3 2 +23 22 -1 +23 23 1 +24 3 2 +24 23 -1 +24 24 1 +25 3 2 +25 24 -1 +25 25 1 +26 3 2 +26 25 -1 +26 26 1 +27 3 2 +27 26 -1 +27 27 1 +28 3 2 +28 27 -1 +28 28 1 +29 3 2 +29 28 -1 +29 29 1 +30 3 2 +30 29 -1 +30 30 1 +31 3 2 +31 30 -1 +31 31 1 +32 3 2 +32 31 -1 +32 32 1 +33 1 1 +33 2 1 +33 3 3 +33 4 1 +33 5 1 +33 6 1 +33 7 1 +33 8 1 +33 9 1 +33 10 1 +33 11 1 +33 12 1 +33 13 1 +33 14 1 +33 15 1 +33 16 1 +33 17 1 +33 18 1 +33 19 1 +33 20 1 +33 21 1 +33 22 1 +33 23 1 +33 24 1 +33 25 1 +33 26 1 +33 27 1 +33 28 1 +33 29 1 +33 30 1 +33 31 1 +33 32 1 +33 33 1 +34 3 2 +34 33 -1 
+34 34 1 +35 2 12345 +35 3 12345 +35 4 12345 +35 5 12345 +35 6 12345 +35 7 12345 +35 8 12345 +35 9 12345 +35 10 12345 +35 11 12345 +35 12 12345 +35 13 12345 +35 14 12345 +35 15 12345 +35 16 12345 +35 17 12345 +35 18 12345 +35 19 12345 +35 20 12345 +35 21 12345 +35 22 12345 +35 23 12345 +35 24 12345 +35 25 12345 +35 26 12345 +35 27 12345 +35 28 12345 +35 29 12345 +35 30 12345 +35 31 12345 +35 32 12345 +35 34 12345 +35 35 1 diff --git a/matrices/test/isai_l_excess.mtx b/matrices/test/isai_l_excess.mtx new file mode 100644 index 00000000000..da41a6c58e1 --- /dev/null +++ b/matrices/test/isai_l_excess.mtx @@ -0,0 +1,250 @@ +%%MatrixMarket matrix coordinate real general +66 66 248 +1 1 1 +2 1 -1 +2 2 1 +3 2 -1 +3 3 2 +4 3 2 +4 4 1 +5 3 2 +5 4 -1 +5 5 1 +6 3 2 +6 5 -1 +6 6 1 +7 3 2 +7 6 -1 +7 7 1 +8 3 2 +8 7 -1 +8 8 1 +9 3 2 +9 8 -1 +9 9 1 +10 3 2 +10 9 -1 +10 10 1 +11 3 2 +11 10 -1 +11 11 1 +12 3 2 +12 11 -1 +12 12 1 +13 3 2 +13 12 -1 +13 13 1 +14 3 2 +14 13 -1 +14 14 1 +15 3 2 +15 14 -1 +15 15 1 +16 3 2 +16 15 -1 +16 16 1 +17 3 2 +17 16 -1 +17 17 1 +18 3 2 +18 17 -1 +18 18 1 +19 3 2 +19 18 -1 +19 19 1 +20 3 2 +20 19 -1 +20 20 1 +21 3 2 +21 20 -1 +21 21 1 +22 3 2 +22 21 -1 +22 22 1 +23 3 2 +23 22 -1 +23 23 1 +24 3 2 +24 23 -1 +24 24 1 +25 3 2 +25 24 -1 +25 25 1 +26 3 2 +26 25 -1 +26 26 1 +27 3 2 +27 26 -1 +27 27 1 +28 3 2 +28 27 -1 +28 28 1 +29 3 2 +29 28 -1 +29 29 1 +30 3 2 +30 29 -1 +30 30 1 +31 3 2 +31 30 -1 +31 31 1 +32 3 2 +32 31 -1 +32 32 1 +33 1 1 +33 2 1 +33 3 3 +33 4 1 +33 5 1 +33 6 1 +33 7 1 +33 8 1 +33 9 1 +33 10 1 +33 11 1 +33 12 1 +33 13 1 +33 14 1 +33 15 1 +33 16 1 +33 17 1 +33 18 1 +33 19 1 +33 20 1 +33 21 1 +33 22 1 +33 23 1 +33 24 1 +33 25 1 +33 26 1 +33 27 1 +33 28 1 +33 29 1 +33 30 1 +33 31 1 +33 32 1 +33 33 1 +34 34 1 +35 34 -1 +35 35 2 +36 35 2 +36 36 1 +37 35 2 +37 36 -1 +37 37 1 +38 35 2 +38 37 -1 +38 38 1 +39 35 2 +39 38 -1 +39 39 1 +40 35 2 +40 39 -1 +40 40 1 +41 35 2 +41 40 -1 +41 41 1 +42 35 2 +42 41 -1 +42 42 1 +43 35 2 +43 42 -1 +43 43 1 +44 
35 2 +44 43 -1 +44 44 1 +45 35 2 +45 44 -1 +45 45 1 +46 35 2 +46 45 -1 +46 46 1 +47 35 2 +47 46 -1 +47 47 1 +48 35 2 +48 47 -1 +48 48 1 +49 35 2 +49 48 -1 +49 49 1 +50 35 2 +50 49 -1 +50 50 1 +51 35 2 +51 50 -1 +51 51 1 +52 35 2 +52 51 -1 +52 52 1 +53 35 2 +53 52 -1 +53 53 1 +54 35 2 +54 53 -1 +54 54 1 +55 35 2 +55 54 -1 +55 55 1 +56 35 2 +56 55 -1 +56 56 1 +57 35 2 +57 56 -1 +57 57 1 +58 35 2 +58 57 -1 +58 58 1 +59 35 2 +59 58 -1 +59 59 1 +60 35 2 +60 59 -1 +60 60 1 +61 35 2 +61 60 -1 +61 61 1 +62 35 2 +62 61 -1 +62 62 1 +63 35 2 +63 62 -1 +63 63 1 +64 35 2 +64 63 -1 +64 64 1 +65 35 2 +65 65 1 +66 34 12345 +66 35 12345 +66 36 12345 +66 37 12345 +66 38 12345 +66 39 12345 +66 40 12345 +66 41 12345 +66 42 12345 +66 43 12345 +66 44 12345 +66 45 12345 +66 46 12345 +66 47 12345 +66 48 12345 +66 49 12345 +66 50 12345 +66 51 12345 +66 52 12345 +66 53 12345 +66 54 12345 +66 55 12345 +66 56 12345 +66 57 12345 +66 58 12345 +66 59 12345 +66 60 12345 +66 61 12345 +66 62 12345 +66 63 12345 +66 64 12345 +66 65 12345 +66 66 1 diff --git a/matrices/test/isai_l_excess_rhs.mtx b/matrices/test/isai_l_excess_rhs.mtx new file mode 100644 index 00000000000..85972f682ab --- /dev/null +++ b/matrices/test/isai_l_excess_rhs.mtx @@ -0,0 +1,4 @@ +%%MatrixMarket matrix coordinate real general +66 1 2 +33 1 1 +66 1 1 diff --git a/matrices/test/isai_l_inv.mtx b/matrices/test/isai_l_inv.mtx new file mode 100644 index 00000000000..188eb3b9daf --- /dev/null +++ b/matrices/test/isai_l_inv.mtx @@ -0,0 +1,162 @@ +%%MatrixMarket matrix coordinate real general +35 35 160 +1 1 1 +2 1 1 +2 2 1 +3 2 0.5 +3 3 0.5 +4 3 -1 +4 4 1 +5 3 -2 +5 4 1 +5 5 1 +6 3 -2 +6 5 1 +6 6 1 +7 3 -2 +7 6 1 +7 7 1 +8 3 -2 +8 7 1 +8 8 1 +9 3 -2 +9 8 1 +9 9 1 +10 3 -2 +10 9 1 +10 10 1 +11 3 -2 +11 10 1 +11 11 1 +12 3 -2 +12 11 1 +12 12 1 +13 3 -2 +13 12 1 +13 13 1 +14 3 -2 +14 13 1 +14 14 1 +15 3 -2 +15 14 1 +15 15 1 +16 3 -2 +16 15 1 +16 16 1 +17 3 -2 +17 16 1 +17 17 1 +18 3 -2 +18 17 1 +18 18 1 +19 3 -2 +19 18 1 +19 19 1 +20 3 
-2 +20 19 1 +20 20 1 +21 3 -2 +21 20 1 +21 21 1 +22 3 -2 +22 21 1 +22 22 1 +23 3 -2 +23 22 1 +23 23 1 +24 3 -2 +24 23 1 +24 24 1 +25 3 -2 +25 24 1 +25 25 1 +26 3 -2 +26 25 1 +26 26 1 +27 3 -2 +27 26 1 +27 27 1 +28 3 -2 +28 27 1 +28 28 1 +29 3 -2 +29 28 1 +29 29 1 +30 3 -2 +30 29 1 +30 30 1 +31 3 -2 +31 30 1 +31 31 1 +32 3 -2 +32 31 1 +32 32 1 +33 1 431.5 +33 2 432.5 +33 3 433.5 +33 4 -29 +33 5 -28 +33 6 -27 +33 7 -26 +33 8 -25 +33 9 -24 +33 10 -23 +33 11 -22 +33 12 -21 +33 13 -20 +33 14 -19 +33 15 -18 +33 16 -17 +33 17 -16 +33 18 -15 +33 19 -14 +33 20 -13 +33 21 -12 +33 22 -11 +33 23 -10 +33 24 -9 +33 25 -8 +33 26 -7 +33 27 -6 +33 28 -5 +33 29 -4 +33 30 -3 +33 31 -2 +33 32 -1 +33 33 1 +34 3 -2.5 +34 33 1 +34 34 1 +35 2 12345 +35 3 12345 +35 4 12345 +35 5 12345 +35 6 12345 +35 7 12345 +35 8 12345 +35 9 12345 +35 10 12345 +35 11 12345 +35 12 12345 +35 13 12345 +35 14 12345 +35 15 12345 +35 16 12345 +35 17 12345 +35 18 12345 +35 19 12345 +35 20 12345 +35 21 12345 +35 22 12345 +35 23 12345 +35 24 12345 +35 25 12345 +35 26 12345 +35 27 12345 +35 28 12345 +35 29 12345 +35 30 12345 +35 31 12345 +35 32 12345 +35 34 12345 +35 35 1 diff --git a/matrices/test/isai_l_inv_partial.mtx b/matrices/test/isai_l_inv_partial.mtx new file mode 100644 index 00000000000..a9fb591f7c8 --- /dev/null +++ b/matrices/test/isai_l_inv_partial.mtx @@ -0,0 +1,162 @@ +%%MatrixMarket matrix coordinate real general +35 35 160 +1 1 1 +2 1 1 +2 2 1 +3 2 0.5 +3 3 0.5 +4 3 -1 +4 4 1 +5 3 -2 +5 4 1 +5 5 1 +6 3 -2 +6 5 1 +6 6 1 +7 3 -2 +7 6 1 +7 7 1 +8 3 -2 +8 7 1 +8 8 1 +9 3 -2 +9 8 1 +9 9 1 +10 3 -2 +10 9 1 +10 10 1 +11 3 -2 +11 10 1 +11 11 1 +12 3 -2 +12 11 1 +12 12 1 +13 3 -2 +13 12 1 +13 13 1 +14 3 -2 +14 13 1 +14 14 1 +15 3 -2 +15 14 1 +15 15 1 +16 3 -2 +16 15 1 +16 16 1 +17 3 -2 +17 16 1 +17 17 1 +18 3 -2 +18 17 1 +18 18 1 +19 3 -2 +19 18 1 +19 19 1 +20 3 -2 +20 19 1 +20 20 1 +21 3 -2 +21 20 1 +21 21 1 +22 3 -2 +22 21 1 +22 22 1 +23 3 -2 +23 22 1 +23 23 1 +24 3 -2 +24 23 1 +24 24 1 +25 3 -2 +25 24 1 
+25 25 1 +26 3 -2 +26 25 1 +26 26 1 +27 3 -2 +27 26 1 +27 27 1 +28 3 -2 +28 27 1 +28 28 1 +29 3 -2 +29 28 1 +29 29 1 +30 3 -2 +30 29 1 +30 30 1 +31 3 -2 +31 30 1 +31 31 1 +32 3 -2 +32 31 1 +32 32 1 +33 1 -1 +33 2 -1 +33 3 -1 +33 4 -1 +33 5 -1 +33 6 -1 +33 7 -1 +33 8 -1 +33 9 -1 +33 10 -1 +33 11 -1 +33 12 -1 +33 13 -1 +33 14 -1 +33 15 -1 +33 16 -1 +33 17 -1 +33 18 -1 +33 19 -1 +33 20 -1 +33 21 -1 +33 22 -1 +33 23 -1 +33 24 -1 +33 25 -1 +33 26 -1 +33 27 -1 +33 28 -1 +33 29 -1 +33 30 -1 +33 31 -1 +33 32 -1 +33 33 -1 +34 3 -2.5 +34 33 1 +34 34 1 +35 2 -1 +35 3 -1 +35 4 -1 +35 5 -1 +35 6 -1 +35 7 -1 +35 8 -1 +35 9 -1 +35 10 -1 +35 11 -1 +35 12 -1 +35 13 -1 +35 14 -1 +35 15 -1 +35 16 -1 +35 17 -1 +35 18 -1 +35 19 -1 +35 20 -1 +35 21 -1 +35 22 -1 +35 23 -1 +35 24 -1 +35 25 -1 +35 26 -1 +35 27 -1 +35 28 -1 +35 29 -1 +35 30 -1 +35 31 -1 +35 32 -1 +35 34 -1 +35 35 -1 diff --git a/matrices/test/isai_u.mtx b/matrices/test/isai_u.mtx new file mode 100644 index 00000000000..6fb357a718f --- /dev/null +++ b/matrices/test/isai_u.mtx @@ -0,0 +1,162 @@ +%%MatrixMarket matrix coordinate real general +35 35 160 +1 1 1 +1 2 -1 +1 33 1 +2 2 1 +2 3 -1 +2 33 1 +2 35 12345 +3 3 2 +3 4 2 +3 5 2 +3 6 2 +3 7 2 +3 8 2 +3 9 2 +3 10 2 +3 11 2 +3 12 2 +3 13 2 +3 14 2 +3 15 2 +3 16 2 +3 17 2 +3 18 2 +3 19 2 +3 20 2 +3 21 2 +3 22 2 +3 23 2 +3 24 2 +3 25 2 +3 26 2 +3 27 2 +3 28 2 +3 29 2 +3 30 2 +3 31 2 +3 32 2 +3 33 3 +3 34 2 +3 35 12345 +4 4 1 +4 5 -1 +4 33 1 +4 35 12345 +5 5 1 +5 6 -1 +5 33 1 +5 35 12345 +6 6 1 +6 7 -1 +6 33 1 +6 35 12345 +7 7 1 +7 8 -1 +7 33 1 +7 35 12345 +8 8 1 +8 9 -1 +8 33 1 +8 35 12345 +9 9 1 +9 10 -1 +9 33 1 +9 35 12345 +10 10 1 +10 11 -1 +10 33 1 +10 35 12345 +11 11 1 +11 12 -1 +11 33 1 +11 35 12345 +12 12 1 +12 13 -1 +12 33 1 +12 35 12345 +13 13 1 +13 14 -1 +13 33 1 +13 35 12345 +14 14 1 +14 15 -1 +14 33 1 +14 35 12345 +15 15 1 +15 16 -1 +15 33 1 +15 35 12345 +16 16 1 +16 17 -1 +16 33 1 +16 35 12345 +17 17 1 +17 18 -1 +17 33 1 +17 35 12345 +18 18 1 +18 19 -1 +18 33 1 +18 
35 12345 +19 19 1 +19 20 -1 +19 33 1 +19 35 12345 +20 20 1 +20 21 -1 +20 33 1 +20 35 12345 +21 21 1 +21 22 -1 +21 33 1 +21 35 12345 +22 22 1 +22 23 -1 +22 33 1 +22 35 12345 +23 23 1 +23 24 -1 +23 33 1 +23 35 12345 +24 24 1 +24 25 -1 +24 33 1 +24 35 12345 +25 25 1 +25 26 -1 +25 33 1 +25 35 12345 +26 26 1 +26 27 -1 +26 33 1 +26 35 12345 +27 27 1 +27 28 -1 +27 33 1 +27 35 12345 +28 28 1 +28 29 -1 +28 33 1 +28 35 12345 +29 29 1 +29 30 -1 +29 33 1 +29 35 12345 +30 30 1 +30 31 -1 +30 33 1 +30 35 12345 +31 31 1 +31 32 -1 +31 33 1 +31 35 12345 +32 32 1 +32 33 1 +32 35 12345 +33 33 1 +33 34 -1 +34 34 1 +34 35 12345 +35 35 1 diff --git a/matrices/test/isai_u_excess.mtx b/matrices/test/isai_u_excess.mtx new file mode 100644 index 00000000000..0ef6a921e03 --- /dev/null +++ b/matrices/test/isai_u_excess.mtx @@ -0,0 +1,155 @@ +%%MatrixMarket matrix coordinate real general +33 33 153 +1 1 2 +1 2 2 +1 3 2 +1 4 2 +1 5 2 +1 6 2 +1 7 2 +1 8 2 +1 9 2 +1 10 2 +1 11 2 +1 12 2 +1 13 2 +1 14 2 +1 15 2 +1 16 2 +1 17 2 +1 18 2 +1 19 2 +1 20 2 +1 21 2 +1 22 2 +1 23 2 +1 24 2 +1 25 2 +1 26 2 +1 27 2 +1 28 2 +1 29 2 +1 30 2 +1 31 3 +1 32 2 +1 33 12345 +2 2 1 +2 3 -1 +2 31 1 +2 33 12345 +3 3 1 +3 4 -1 +3 31 1 +3 33 12345 +4 4 1 +4 5 -1 +4 31 1 +4 33 12345 +5 5 1 +5 6 -1 +5 31 1 +5 33 12345 +6 6 1 +6 7 -1 +6 31 1 +6 33 12345 +7 7 1 +7 8 -1 +7 31 1 +7 33 12345 +8 8 1 +8 9 -1 +8 31 1 +8 33 12345 +9 9 1 +9 10 -1 +9 31 1 +9 33 12345 +10 10 1 +10 11 -1 +10 31 1 +10 33 12345 +11 11 1 +11 12 -1 +11 31 1 +11 33 12345 +12 12 1 +12 13 -1 +12 31 1 +12 33 12345 +13 13 1 +13 14 -1 +13 31 1 +13 33 12345 +14 14 1 +14 15 -1 +14 31 1 +14 33 12345 +15 15 1 +15 16 -1 +15 31 1 +15 33 12345 +16 16 1 +16 17 -1 +16 31 1 +16 33 12345 +17 17 1 +17 18 -1 +17 31 1 +17 33 12345 +18 18 1 +18 19 -1 +18 31 1 +18 33 12345 +19 19 1 +19 20 -1 +19 31 1 +19 33 12345 +20 20 1 +20 21 -1 +20 31 1 +20 33 12345 +21 21 1 +21 22 -1 +21 31 1 +21 33 12345 +22 22 1 +22 23 -1 +22 31 1 +22 33 12345 +23 23 1 +23 24 -1 +23 31 1 +23 33 12345 +24 
24 1 +24 25 -1 +24 31 1 +24 33 12345 +25 25 1 +25 26 -1 +25 31 1 +25 33 12345 +26 26 1 +26 27 -1 +26 31 1 +26 33 12345 +27 27 1 +27 28 -1 +27 31 1 +27 33 12345 +28 28 1 +28 29 -1 +28 31 1 +28 33 12345 +29 29 1 +29 30 -1 +29 31 1 +29 33 12345 +30 30 1 +30 31 1 +30 33 12345 +31 31 1 +31 32 -1 +32 32 1 +32 33 12345 +33 33 1 diff --git a/matrices/test/isai_u_excess_rhs.mtx b/matrices/test/isai_u_excess_rhs.mtx new file mode 100644 index 00000000000..2218b9de2e7 --- /dev/null +++ b/matrices/test/isai_u_excess_rhs.mtx @@ -0,0 +1,3 @@ +%%MatrixMarket matrix coordinate real general +33 1 1 +1 1 1 diff --git a/matrices/test/isai_u_inv.mtx b/matrices/test/isai_u_inv.mtx new file mode 100644 index 00000000000..48aef7daae8 --- /dev/null +++ b/matrices/test/isai_u_inv.mtx @@ -0,0 +1,162 @@ +%%MatrixMarket matrix coordinate real general +35 35 160 +1 1 1 +1 2 1 +1 33 -2 +2 2 1 +2 3 0.5 +2 33 -2.5 +2 35 12345 +3 3 0.5 +3 4 -1 +3 5 -2 +3 6 -3 +3 7 -4 +3 8 -5 +3 9 -6 +3 10 -7 +3 11 -8 +3 12 -9 +3 13 -10 +3 14 -11 +3 15 -12 +3 16 -13 +3 17 -14 +3 18 -15 +3 19 -16 +3 20 -17 +3 21 -18 +3 22 -19 +3 23 -20 +3 24 -21 +3 25 -22 +3 26 -23 +3 27 -24 +3 28 -25 +3 29 -26 +3 30 -27 +3 31 -28 +3 32 -29 +3 33 433.5 +3 34 432.5 +3 35 12345 +4 4 1 +4 5 1 +4 33 -2 +4 35 12345 +5 5 1 +5 6 1 +5 33 -2 +5 35 12345 +6 6 1 +6 7 1 +6 33 -2 +6 35 12345 +7 7 1 +7 8 1 +7 33 -2 +7 35 12345 +8 8 1 +8 9 1 +8 33 -2 +8 35 12345 +9 9 1 +9 10 1 +9 33 -2 +9 35 12345 +10 10 1 +10 11 1 +10 33 -2 +10 35 12345 +11 11 1 +11 12 1 +11 33 -2 +11 35 12345 +12 12 1 +12 13 1 +12 33 -2 +12 35 12345 +13 13 1 +13 14 1 +13 33 -2 +13 35 12345 +14 14 1 +14 15 1 +14 33 -2 +14 35 12345 +15 15 1 +15 16 1 +15 33 -2 +15 35 12345 +16 16 1 +16 17 1 +16 33 -2 +16 35 12345 +17 17 1 +17 18 1 +17 33 -2 +17 35 12345 +18 18 1 +18 19 1 +18 33 -2 +18 35 12345 +19 19 1 +19 20 1 +19 33 -2 +19 35 12345 +20 20 1 +20 21 1 +20 33 -2 +20 35 12345 +21 21 1 +21 22 1 +21 33 -2 +21 35 12345 +22 22 1 +22 23 1 +22 33 -2 +22 35 12345 +23 23 1 +23 24 1 +23 33 -2 
+23 35 12345 +24 24 1 +24 25 1 +24 33 -2 +24 35 12345 +25 25 1 +25 26 1 +25 33 -2 +25 35 12345 +26 26 1 +26 27 1 +26 33 -2 +26 35 12345 +27 27 1 +27 28 1 +27 33 -2 +27 35 12345 +28 28 1 +28 29 1 +28 33 -2 +28 35 12345 +29 29 1 +29 30 1 +29 33 -2 +29 35 12345 +30 30 1 +30 31 1 +30 33 -2 +30 35 12345 +31 31 1 +31 32 1 +31 33 -2 +31 35 12345 +32 32 1 +32 33 -1 +32 35 12345 +33 33 1 +33 34 1 +34 34 1 +34 35 12345 +35 35 1 diff --git a/matrices/test/isai_u_inv_partial.mtx b/matrices/test/isai_u_inv_partial.mtx new file mode 100644 index 00000000000..50ab5203f68 --- /dev/null +++ b/matrices/test/isai_u_inv_partial.mtx @@ -0,0 +1,162 @@ +%%MatrixMarket matrix coordinate real general +35 35 160 +1 1 1 +1 2 1 +1 33 -2 +2 2 1 +2 3 0.5 +2 33 -2.5 +2 35 12345 +3 3 -1 +3 4 -1 +3 5 -1 +3 6 -1 +3 7 -1 +3 8 -1 +3 9 -1 +3 10 -1 +3 11 -1 +3 12 -1 +3 13 -1 +3 14 -1 +3 15 -1 +3 16 -1 +3 17 -1 +3 18 -1 +3 19 -1 +3 20 -1 +3 21 -1 +3 22 -1 +3 23 -1 +3 24 -1 +3 25 -1 +3 26 -1 +3 27 -1 +3 28 -1 +3 29 -1 +3 30 -1 +3 31 -1 +3 32 -1 +3 33 -1 +3 34 -1 +3 35 -1 +4 4 1 +4 5 1 +4 33 -2 +4 35 12345 +5 5 1 +5 6 1 +5 33 -2 +5 35 12345 +6 6 1 +6 7 1 +6 33 -2 +6 35 12345 +7 7 1 +7 8 1 +7 33 -2 +7 35 12345 +8 8 1 +8 9 1 +8 33 -2 +8 35 12345 +9 9 1 +9 10 1 +9 33 -2 +9 35 12345 +10 10 1 +10 11 1 +10 33 -2 +10 35 12345 +11 11 1 +11 12 1 +11 33 -2 +11 35 12345 +12 12 1 +12 13 1 +12 33 -2 +12 35 12345 +13 13 1 +13 14 1 +13 33 -2 +13 35 12345 +14 14 1 +14 15 1 +14 33 -2 +14 35 12345 +15 15 1 +15 16 1 +15 33 -2 +15 35 12345 +16 16 1 +16 17 1 +16 33 -2 +16 35 12345 +17 17 1 +17 18 1 +17 33 -2 +17 35 12345 +18 18 1 +18 19 1 +18 33 -2 +18 35 12345 +19 19 1 +19 20 1 +19 33 -2 +19 35 12345 +20 20 1 +20 21 1 +20 33 -2 +20 35 12345 +21 21 1 +21 22 1 +21 33 -2 +21 35 12345 +22 22 1 +22 23 1 +22 33 -2 +22 35 12345 +23 23 1 +23 24 1 +23 33 -2 +23 35 12345 +24 24 1 +24 25 1 +24 33 -2 +24 35 12345 +25 25 1 +25 26 1 +25 33 -2 +25 35 12345 +26 26 1 +26 27 1 +26 33 -2 +26 35 12345 +27 27 1 +27 28 1 +27 33 -2 +27 35 12345 
+28 28 1 +28 29 1 +28 33 -2 +28 35 12345 +29 29 1 +29 30 1 +29 33 -2 +29 35 12345 +30 30 1 +30 31 1 +30 33 -2 +30 35 12345 +31 31 1 +31 32 1 +31 33 -2 +31 35 12345 +32 32 1 +32 33 -1 +32 35 12345 +33 33 1 +33 34 1 +34 34 1 +34 35 12345 +35 35 1 diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index 3f8705b2d2b..8a26b5931a2 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -3,38 +3,58 @@ find_package(OpenMP REQUIRED) add_library(ginkgo_omp $ "") target_sources(ginkgo_omp PRIVATE - base/version.cpp - factorization/par_ilu_kernels.cpp - matrix/coo_kernels.cpp - matrix/csr_kernels.cpp - matrix/dense_kernels.cpp - matrix/ell_kernels.cpp - matrix/hybrid_kernels.cpp - matrix/sellp_kernels.cpp - matrix/sparsity_csr_kernels.cpp - preconditioner/jacobi_kernels.cpp - solver/bicgstab_kernels.cpp - solver/cg_kernels.cpp - solver/cgs_kernels.cpp - solver/fcg_kernels.cpp - solver/gmres_kernels.cpp - solver/ir_kernels.cpp - solver/lower_trs_kernels.cpp - solver/upper_trs_kernels.cpp - stop/criterion_kernels.cpp - stop/residual_norm_reduction_kernels.cpp) + base/version.cpp + components/fill_array.cpp + components/precision_conversion.cpp + components/prefix_sum.cpp + factorization/ilu_kernels.cpp + factorization/factorization_kernels.cpp + factorization/par_ict_kernels.cpp + factorization/par_ilu_kernels.cpp + factorization/par_ilut_kernels.cpp + matrix/coo_kernels.cpp + matrix/csr_kernels.cpp + matrix/dense_kernels.cpp + matrix/ell_kernels.cpp + matrix/hybrid_kernels.cpp + matrix/sellp_kernels.cpp + matrix/sparsity_csr_kernels.cpp + preconditioner/isai_kernels.cpp + preconditioner/jacobi_kernels.cpp + solver/bicg_kernels.cpp + solver/bicgstab_kernels.cpp + solver/cg_kernels.cpp + solver/cgs_kernels.cpp + solver/fcg_kernels.cpp + solver/gmres_kernels.cpp + solver/ir_kernels.cpp + solver/lower_trs_kernels.cpp + solver/upper_trs_kernels.cpp + stop/criterion_kernels.cpp + stop/residual_norm_kernels.cpp) ginkgo_compile_features(ginkgo_omp) + 
target_link_libraries(ginkgo_omp PRIVATE "${OpenMP_CXX_LIBRARIES}") -target_compile_options(ginkgo_omp PRIVATE "${OpenMP_CXX_FLAGS}") +target_include_directories(ginkgo_omp PRIVATE "${OpenMP_CXX_INCLUDE_DIRS}") +# We first separate the arguments, otherwise, the target_compile_options adds it as a string +# and the compiler is unhappy with the quotation marks. +separate_arguments(OpenMP_SEP_FLAGS NATIVE_COMMAND "${OpenMP_CXX_FLAGS}") +target_compile_options(ginkgo_omp PRIVATE "${OpenMP_SEP_FLAGS}") target_compile_options(ginkgo_omp PRIVATE "${GINKGO_COMPILER_FLAGS}") # Need to link against ginkgo_cuda for the `raw_copy_to(CudaExecutor ...)` method target_link_libraries(ginkgo_omp PUBLIC ginkgo_cuda) +# Need to link against ginkgo_hip for the `raw_copy_to(HipExecutor ...)` method +target_link_libraries(ginkgo_omp PUBLIC ginkgo_hip) ginkgo_default_includes(ginkgo_omp) ginkgo_install_library(ginkgo_omp omp) +if (GINKGO_CHECK_CIRCULAR_DEPS) + ginkgo_check_headers(ginkgo_omp) +endif() + if(GINKGO_BUILD_TESTS) add_subdirectory(test) endif() diff --git a/omp/base/version.cpp b/omp/base/version.cpp index f9c7a68dcf3..d429e2cbcf1 100644 --- a/omp/base/version.cpp +++ b/omp/base/version.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/omp/components/csr_spgeam.hpp b/omp/components/csr_spgeam.hpp new file mode 100644 index 00000000000..d88e612a0ae --- /dev/null +++ b/omp/components/csr_spgeam.hpp @@ -0,0 +1,115 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_OMP_COMPONENTS_CSR_SPGEAM_HPP_ +#define GKO_OMP_COMPONENTS_CSR_SPGEAM_HPP_ + + +#include + + +#include + + +#include "core/base/utils.hpp" + + +namespace gko { +namespace kernels { +namespace omp { + + +/** + * Adds two (sorted) sparse matrices. + * + * Calls begin_cb(row) on each row to initialize row-local data + * Calls entry_cb(row, col, a_val, b_val, local_data) on each output non-zero + * Calls end_cb(row, local_data) on each row to finalize row-local data + * + * If the three functions are thread-safe, the whole invocation is. 
+ */ +template +void abstract_spgeam(const matrix::Csr *a, + const matrix::Csr *b, + BeginCallback begin_cb, EntryCallback entry_cb, + EndCallback end_cb) +{ + auto num_rows = a->get_size()[0]; + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto a_vals = a->get_const_values(); + auto b_row_ptrs = b->get_const_row_ptrs(); + auto b_col_idxs = b->get_const_col_idxs(); + auto b_vals = b->get_const_values(); + constexpr auto sentinel = std::numeric_limits::max(); +#pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + auto a_begin = a_row_ptrs[row]; + auto a_end = a_row_ptrs[row + 1]; + auto b_begin = b_row_ptrs[row]; + auto b_end = b_row_ptrs[row + 1]; + auto total_size = (a_end - a_begin) + (b_end - b_begin); + bool skip{}; + auto local_data = begin_cb(row); + for (IndexType i = 0; i < total_size; ++i) { + if (skip) { + skip = false; + continue; + } + // load column indices or sentinel + auto a_col = checked_load(a_col_idxs, a_begin, a_end, sentinel); + auto b_col = checked_load(b_col_idxs, b_begin, b_end, sentinel); + auto a_val = + checked_load(a_vals, a_begin, a_end, zero()); + auto b_val = + checked_load(b_vals, b_begin, b_end, zero()); + auto col = min(a_col, b_col); + // callback + entry_cb(row, col, a_col == col ? a_val : zero(), + b_col == col ? b_val : zero(), local_data); + // advance indices + a_begin += (a_col <= b_col); + b_begin += (b_col <= a_col); + skip = a_col == b_col; + } + end_cb(row, local_data); + } +} + + +} // namespace omp +} // namespace kernels +} // namespace gko + + +#endif // GKO_OMP_COMPONENTS_CSR_SPGEAM_HPP_ diff --git a/omp/components/fill_array.cpp b/omp/components/fill_array.cpp new file mode 100644 index 00000000000..60844522cbe --- /dev/null +++ b/omp/components/fill_array.cpp @@ -0,0 +1,60 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/components/fill_array.hpp" + + +namespace gko { +namespace kernels { +namespace omp { +namespace components { + + +template +void fill_array(std::shared_ptr exec, ValueType *array, + size_type n, ValueType val) +{ +#pragma omp parallel for + for (size_type i = 0; i < n; ++i) { + array[i] = val; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL); +template GKO_DECLARE_FILL_ARRAY_KERNEL(size_type); + + +} // namespace components +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/omp/components/format_conversion.hpp b/omp/components/format_conversion.hpp index 35e98965207..1d2e9da9e46 100644 --- a/omp/components/format_conversion.hpp +++ b/omp/components/format_conversion.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -37,6 +37,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include + + namespace gko { namespace kernels { namespace omp { diff --git a/omp/components/matrix_operations.hpp b/omp/components/matrix_operations.hpp index 3adb28095bb..7fa629811ee 100644 --- a/omp/components/matrix_operations.hpp +++ b/omp/components/matrix_operations.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/omp/components/precision_conversion.cpp b/omp/components/precision_conversion.cpp new file mode 100644 index 00000000000..4c4553470a8 --- /dev/null +++ b/omp/components/precision_conversion.cpp @@ -0,0 +1,58 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/components/precision_conversion.hpp" + + +namespace gko { +namespace kernels { +namespace omp { +namespace components { + + +template +void convert_precision(std::shared_ptr exec, + size_type size, const SourceType *in, TargetType *out) +{ +#pragma omp parallel for + for (size_type i = 0; i < size; ++i) { + out[i] = in[i]; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(GKO_DECLARE_CONVERT_PRECISION_KERNEL); + + +} // namespace components +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/omp/components/prefix_sum.cpp b/omp/components/prefix_sum.cpp new file mode 100644 index 00000000000..1375c1643dc --- /dev/null +++ b/omp/components/prefix_sum.cpp @@ -0,0 +1,63 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/components/prefix_sum.hpp" + + +namespace gko { +namespace kernels { +namespace omp { +namespace components { + + +template +void prefix_sum(std::shared_ptr exec, IndexType *counts, + size_type num_entries) +{ + IndexType partial_sum{}; + for (IndexType i = 0; i < num_entries; ++i) { + auto nnz = counts[i]; + counts[i] = partial_sum; + partial_sum += nnz; + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL); + +// instantiate for size_type as well, as this is used in the Sellp format +template GKO_DECLARE_PREFIX_SUM_KERNEL(size_type); + + +} // namespace components +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/omp/factorization/factorization_kernels.cpp b/omp/factorization/factorization_kernels.cpp new file mode 100644 index 00000000000..f4b1f616444 --- /dev/null +++ b/omp/factorization/factorization_kernels.cpp @@ -0,0 +1,388 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/factorization/factorization_kernels.hpp" + + +#include +#include + + +#include +#include + + +#include "core/components/prefix_sum.hpp" +#include "core/matrix/csr_builder.hpp" + + +namespace gko { +namespace kernels { +namespace omp { +/** + * @brief The factorization namespace. 
+ * + * @ingroup factor + */ +namespace factorization { + + +namespace kernel { +namespace detail { + + +template +struct find_helper { + template + static inline bool find(ForwardIt first, ForwardIt last, IndexType value) + { + return std::find(first, last, value) != last; + } +}; + + +template <> +struct find_helper { + template + static inline bool find(ForwardIt first, ForwardIt last, IndexType value) + { + return std::binary_search(first, last, value); + } +}; + + +} // namespace detail + + +template +void find_missing_diagonal_elements( + const matrix::Csr *mtx, + IndexType *elements_to_add_per_row, bool *changes_required) +{ + auto num_rows = static_cast(mtx->get_size()[0]); + auto num_cols = static_cast(mtx->get_size()[1]); + auto col_idxs = mtx->get_const_col_idxs(); + auto row_ptrs = mtx->get_const_row_ptrs(); + bool local_change{false}; +#pragma omp parallel for reduction(|| : local_change) + for (IndexType row = 0; row < num_rows; ++row) { + if (row >= num_cols) { + elements_to_add_per_row[row] = 0; + continue; + } + const auto *start_cols = col_idxs + row_ptrs[row]; + const auto *end_cols = col_idxs + row_ptrs[row + 1]; + if (detail::find_helper::find(start_cols, end_cols, row)) { + elements_to_add_per_row[row] = 0; + } else { + elements_to_add_per_row[row] = 1; + local_change = true; + } + } + *changes_required = local_change; +} + + +template +void add_missing_diagonal_elements(const matrix::Csr *mtx, + ValueType *new_values, + IndexType *new_col_idxs, + const IndexType *row_ptrs_addition) +{ + const auto num_rows = static_cast(mtx->get_size()[0]); + const auto old_values = mtx->get_const_values(); + const auto old_col_idxs = mtx->get_const_col_idxs(); + const auto row_ptrs = mtx->get_const_row_ptrs(); +#pragma omp parallel for + for (IndexType row = 0; row < num_rows; ++row) { + const IndexType old_row_start{row_ptrs[row]}; + const IndexType old_row_end{row_ptrs[row + 1]}; + const IndexType new_row_start{old_row_start + row_ptrs_addition[row]}; + 
const IndexType new_row_end{old_row_end + row_ptrs_addition[row + 1]}; + + // if no element needs to be added, do a simple copy + if (new_row_end - new_row_start == old_row_end - old_row_start) { + for (IndexType i = 0; i < new_row_end - new_row_start; ++i) { + const IndexType new_idx = new_row_start + i; + const IndexType old_idx = old_row_start + i; + new_values[new_idx] = old_values[old_idx]; + new_col_idxs[new_idx] = old_col_idxs[old_idx]; + } + } else { + IndexType new_idx = new_row_start; + bool diagonal_added{false}; + for (IndexType old_idx = old_row_start; old_idx < old_row_end; + ++old_idx) { + const auto col_idx = old_col_idxs[old_idx]; + if (!diagonal_added && row < col_idx) { + new_values[new_idx] = zero(); + new_col_idxs[new_idx] = row; + ++new_idx; + diagonal_added = true; + } + new_values[new_idx] = old_values[old_idx]; + new_col_idxs[new_idx] = col_idx; + ++new_idx; + } + if (!diagonal_added) { + new_values[new_idx] = zero(); + new_col_idxs[new_idx] = row; + diagonal_added = true; + } + } + } +} + + +} // namespace kernel + + +template +void add_diagonal_elements(std::shared_ptr exec, + matrix::Csr *mtx, + bool is_sorted) +{ + auto mtx_size = mtx->get_size(); + size_type row_ptrs_size = mtx_size[0] + 1; + Array row_ptrs_addition{exec, row_ptrs_size}; + bool needs_change{}; + if (is_sorted) { + kernel::find_missing_diagonal_elements( + mtx, row_ptrs_addition.get_data(), &needs_change); + } else { + kernel::find_missing_diagonal_elements( + mtx, row_ptrs_addition.get_data(), &needs_change); + } + if (!needs_change) { + return; + } + + row_ptrs_addition.get_data()[row_ptrs_size - 1] = 0; + components::prefix_sum(exec, row_ptrs_addition.get_data(), row_ptrs_size); + + size_type new_num_elems = mtx->get_num_stored_elements() + + row_ptrs_addition.get_data()[row_ptrs_size - 1]; + Array new_values{exec, new_num_elems}; + Array new_col_idxs{exec, new_num_elems}; + kernel::add_missing_diagonal_elements(mtx, new_values.get_data(), + new_col_idxs.get_data(), 
+ row_ptrs_addition.get_const_data()); + + auto old_row_ptrs_ptr = mtx->get_row_ptrs(); + auto row_ptrs_addition_ptr = row_ptrs_addition.get_const_data(); +#pragma omp parallel for + for (IndexType i = 0; i < row_ptrs_size; ++i) { + old_row_ptrs_ptr[i] += row_ptrs_addition_ptr[i]; + } + + matrix::CsrBuilder mtx_builder{mtx}; + mtx_builder.get_value_array() = std::move(new_values); + mtx_builder.get_col_idx_array() = std::move(new_col_idxs); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL); + + +template +void initialize_row_ptrs_l_u( + std::shared_ptr exec, + const matrix::Csr *system_matrix, + IndexType *l_row_ptrs, IndexType *u_row_ptrs) +{ + auto num_rows = system_matrix->get_size()[0]; + auto row_ptrs = system_matrix->get_const_row_ptrs(); + auto col_idxs = system_matrix->get_const_col_idxs(); + +// Calculate the NNZ per row first +#pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + size_type l_nnz{}; + size_type u_nnz{}; + for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) { + size_type col = col_idxs[el]; + // don't count diagonal + l_nnz += col < row; + u_nnz += col > row; + } + // add diagonal again + l_row_ptrs[row] = l_nnz + 1; + u_row_ptrs[row] = u_nnz + 1; + } + + // Now, compute the prefix-sum, to get proper row_ptrs for L and U + components::prefix_sum(exec, l_row_ptrs, num_rows + 1); + components::prefix_sum(exec, u_row_ptrs, num_rows + 1); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL); + + +template +void initialize_l_u(std::shared_ptr exec, + const matrix::Csr *system_matrix, + matrix::Csr *csr_l, + matrix::Csr *csr_u) +{ + const auto row_ptrs = system_matrix->get_const_row_ptrs(); + const auto col_idxs = system_matrix->get_const_col_idxs(); + const auto vals = system_matrix->get_const_values(); + + const auto row_ptrs_l = csr_l->get_const_row_ptrs(); + auto col_idxs_l = 
csr_l->get_col_idxs(); + auto vals_l = csr_l->get_values(); + + const auto row_ptrs_u = csr_u->get_const_row_ptrs(); + auto col_idxs_u = csr_u->get_col_idxs(); + auto vals_u = csr_u->get_values(); + +#pragma omp parallel for + for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) { + size_type current_index_l = row_ptrs_l[row]; + size_type current_index_u = + row_ptrs_u[row] + 1; // we treat the diagonal separately + // if there is no diagonal value, set it to 1 by default + auto diag_val = one(); + for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) { + const auto col = col_idxs[el]; + const auto val = vals[el]; + if (col < row) { + col_idxs_l[current_index_l] = col; + vals_l[current_index_l] = val; + ++current_index_l; + } else if (col == row) { + // save value for later + diag_val = val; + } else { // col > row + col_idxs_u[current_index_u] = col; + vals_u[current_index_u] = val; + ++current_index_u; + } + } + // store diagonal entries + size_type l_diag_idx = row_ptrs_l[row + 1] - 1; + size_type u_diag_idx = row_ptrs_u[row]; + col_idxs_l[l_diag_idx] = row; + col_idxs_u[u_diag_idx] = row; + vals_l[l_diag_idx] = one(); + vals_u[u_diag_idx] = diag_val; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL); + + +template +void initialize_row_ptrs_l( + std::shared_ptr exec, + const matrix::Csr *system_matrix, + IndexType *l_row_ptrs) +{ + auto num_rows = system_matrix->get_size()[0]; + auto row_ptrs = system_matrix->get_const_row_ptrs(); + auto col_idxs = system_matrix->get_const_col_idxs(); + +// Calculate the NNZ per row first +#pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + size_type l_nnz{}; + for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) { + size_type col = col_idxs[el]; + // skip diagonal + l_nnz += col < row; + } + // add diagonal again + l_row_ptrs[row] = l_nnz + 1; + } + + // Now, compute the prefix-sum, to get proper row_ptrs for 
L + components::prefix_sum(exec, l_row_ptrs, num_rows + 1); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL); + + +template +void initialize_l(std::shared_ptr exec, + const matrix::Csr *system_matrix, + matrix::Csr *csr_l, bool diag_sqrt) +{ + const auto row_ptrs = system_matrix->get_const_row_ptrs(); + const auto col_idxs = system_matrix->get_const_col_idxs(); + const auto vals = system_matrix->get_const_values(); + + const auto row_ptrs_l = csr_l->get_const_row_ptrs(); + auto col_idxs_l = csr_l->get_col_idxs(); + auto vals_l = csr_l->get_values(); + +#pragma omp parallel for + for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) { + size_type current_index_l = row_ptrs_l[row]; + // if there is no diagonal value, set it to 1 by default + auto diag_val = one(); + for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) { + const auto col = col_idxs[el]; + const auto val = vals[el]; + if (col < row) { + col_idxs_l[current_index_l] = col; + vals_l[current_index_l] = val; + ++current_index_l; + } else if (col == row) { + // save value for later + diag_val = val; + } + } + // store diagonal entries + size_type l_diag_idx = row_ptrs_l[row + 1] - 1; + col_idxs_l[l_diag_idx] = row; + // compute square root with sentinel + if (diag_sqrt) { + diag_val = sqrt(diag_val); + if (!is_finite(diag_val)) { + diag_val = one(); + } + } + vals_l[l_diag_idx] = diag_val; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL); + + +} // namespace factorization +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/omp/factorization/ilu_kernels.cpp b/omp/factorization/ilu_kernels.cpp new file mode 100644 index 00000000000..77f30a9a753 --- /dev/null +++ b/omp/factorization/ilu_kernels.cpp @@ -0,0 +1,58 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/factorization/ilu_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace omp { +/** + * @brief The ilu factorization namespace. 
+ * + * @ingroup factor + */ +namespace ilu_factorization { + + +template +void compute_lu(std::shared_ptr exec, + matrix::Csr *m) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ILU_COMPUTE_LU_KERNEL); + + +} // namespace ilu_factorization +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/omp/factorization/par_ict_kernels.cpp b/omp/factorization/par_ict_kernels.cpp new file mode 100644 index 00000000000..15d80ab8755 --- /dev/null +++ b/omp/factorization/par_ict_kernels.cpp @@ -0,0 +1,207 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/factorization/par_ict_kernels.hpp" + + +#include +#include +#include +#include + + +#include +#include +#include +#include + + +#include "core/base/utils.hpp" +#include "core/components/prefix_sum.hpp" +#include "core/matrix/csr_builder.hpp" +#include "omp/components/csr_spgeam.hpp" + + +namespace gko { +namespace kernels { +namespace omp { +/** + * @brief The parallel ICT factorization namespace. + * + * @ingroup factor + */ +namespace par_ict_factorization { + + +template +void compute_factor(std::shared_ptr exec, + const matrix::Csr *a, + matrix::Csr *l, + const matrix::Coo *) +{ + auto num_rows = a->get_size()[0]; + auto l_row_ptrs = l->get_const_row_ptrs(); + auto l_col_idxs = l->get_const_col_idxs(); + auto l_vals = l->get_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto a_vals = a->get_const_values(); + +#pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + for (size_type l_nz = l_row_ptrs[row]; l_nz < l_row_ptrs[row + 1]; + ++l_nz) { + auto col = l_col_idxs[l_nz]; + // find value from A + auto a_begin = a_row_ptrs[row]; + auto a_end = a_row_ptrs[row + 1]; + auto a_nz_it = + std::lower_bound(a_col_idxs + a_begin, a_col_idxs + a_end, col); + auto a_nz = std::distance(a_col_idxs, a_nz_it); + auto has_a = a_nz < a_end && a_col_idxs[a_nz] == col; + auto a_val = has_a ? 
a_vals[a_nz] : zero(); + // accumulate l(row,:) * l(col,:) without the last entry l(col, col) + ValueType sum{}; + IndexType lt_nz{}; + auto l_begin = l_row_ptrs[row]; + auto l_end = l_row_ptrs[row + 1]; + auto lt_begin = l_row_ptrs[col]; + auto lt_end = l_row_ptrs[col + 1]; + while (l_begin < l_end && lt_begin < lt_end) { + auto l_col = l_col_idxs[l_begin]; + auto lt_row = l_col_idxs[lt_begin]; + if (l_col == lt_row && l_col < col) { + sum += l_vals[l_begin] * l_vals[lt_begin]; + } + if (lt_row == row) { + lt_nz = lt_begin; + } + l_begin += (l_col <= lt_row); + lt_begin += (lt_row <= l_col); + } + auto new_val = a_val - sum; + if (row == col) { + new_val = sqrt(new_val); + } else { + auto diag = l_vals[l_row_ptrs[col + 1] - 1]; + new_val = new_val / diag; + } + if (is_finite(new_val)) { + l_vals[l_nz] = new_val; + } + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL); + + +template +void add_candidates(std::shared_ptr exec, + const matrix::Csr *llt, + const matrix::Csr *a, + const matrix::Csr *l, + matrix::Csr *l_new) +{ + auto num_rows = a->get_size()[0]; + auto l_row_ptrs = l->get_const_row_ptrs(); + auto l_col_idxs = l->get_const_col_idxs(); + auto l_vals = l->get_const_values(); + auto l_new_row_ptrs = l_new->get_row_ptrs(); + constexpr auto sentinel = std::numeric_limits::max(); + // count nnz + abstract_spgeam( + a, llt, [](IndexType) { return IndexType{}; }, + [](IndexType row, IndexType col, ValueType, ValueType, IndexType &nnz) { + nnz += col <= row; + }, + [&](IndexType row, IndexType nnz) { l_new_row_ptrs[row] = nnz; }); + + components::prefix_sum(exec, l_new_row_ptrs, num_rows + 1); + + // resize arrays + auto l_nnz = l_new_row_ptrs[num_rows]; + matrix::CsrBuilder l_builder{l_new}; + l_builder.get_col_idx_array().resize_and_reset(l_nnz); + l_builder.get_value_array().resize_and_reset(l_nnz); + auto l_new_col_idxs = l_new->get_col_idxs(); + auto l_new_vals = l_new->get_values(); + + // accumulate 
non-zeros + struct row_state { + IndexType l_new_nz; + IndexType l_old_begin; + IndexType l_old_end; + }; + abstract_spgeam( + a, llt, + [&](IndexType row) { + row_state state{}; + state.l_new_nz = l_new_row_ptrs[row]; + state.l_old_begin = l_row_ptrs[row]; + state.l_old_end = l_row_ptrs[row + 1]; + return state; + }, + [&](IndexType row, IndexType col, ValueType a_val, ValueType llt_val, + row_state &state) { + auto r_val = a_val - llt_val; + // load matching entry of L + auto l_col = checked_load(l_col_idxs, state.l_old_begin, + state.l_old_end, sentinel); + auto l_val = checked_load(l_vals, state.l_old_begin, + state.l_old_end, zero()); + // load diagonal entry of L + auto diag = l_vals[l_row_ptrs[col + 1] - 1]; + // if there is already an entry present, use that + // instead. + auto out_val = l_col == col ? l_val : r_val / diag; + // store output entries + if (row >= col) { + l_new_col_idxs[state.l_new_nz] = col; + l_new_vals[state.l_new_nz] = out_val; + state.l_new_nz++; + } + // advance entry of L if we used it + state.l_old_begin += (l_col == col); + }, + [](IndexType, row_state) {}); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL); + + +} // namespace par_ict_factorization +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/omp/factorization/par_ilu_kernels.cpp b/omp/factorization/par_ilu_kernels.cpp index 4c06f2337b0..d658c0da579 100644 --- a/omp/factorization/par_ilu_kernels.cpp +++ b/omp/factorization/par_ilu_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/factorization/par_ilu_kernels.hpp" +#include + + #include #include #include @@ -49,102 +52,6 @@ namespace omp { namespace par_ilu_factorization { -template -void initialize_row_ptrs_l_u( - std::shared_ptr exec, - const matrix::Csr *system_matrix, - IndexType *l_row_ptrs, IndexType *u_row_ptrs) -{ - auto row_ptrs = system_matrix->get_const_row_ptrs(); - auto col_idxs = system_matrix->get_const_col_idxs(); - - l_row_ptrs[0] = 0; - u_row_ptrs[0] = 0; -// Calculate the NNZ per row first -#pragma omp parallel for - for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) { - size_type l_nnz{}; - size_type u_nnz{}; - for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) { - size_type col = col_idxs[el]; - if (col <= row) { - ++l_nnz; - } - if (col >= row) { - ++u_nnz; - } - } - l_row_ptrs[row + 1] = l_nnz; - u_row_ptrs[row + 1] = u_nnz; - } - - // Now, compute the prefix-sum, to get proper row_ptrs for L and U - IndexType l_previous_nnz{}; - IndexType u_previous_nnz{}; - for (size_type row = 1; row < system_matrix->get_size()[0] + 1; ++row) { - l_previous_nnz += l_row_ptrs[row]; - u_previous_nnz += u_row_ptrs[row]; - - l_row_ptrs[row] = l_previous_nnz; - u_row_ptrs[row] = u_previous_nnz; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILU_INITIALIZE_ROW_PTRS_L_U_KERNEL); - - -template -void initialize_l_u(std::shared_ptr exec, - const matrix::Csr *system_matrix, - matrix::Csr *csr_l, - matrix::Csr *csr_u) -{ - const auto row_ptrs = system_matrix->get_const_row_ptrs(); - const auto col_idxs = system_matrix->get_const_col_idxs(); - const auto vals = system_matrix->get_const_values(); - - const auto row_ptrs_l = csr_l->get_const_row_ptrs(); - auto col_idxs_l = csr_l->get_col_idxs(); - auto vals_l = csr_l->get_values(); - - const auto row_ptrs_u = csr_u->get_const_row_ptrs(); - auto col_idxs_u = csr_u->get_col_idxs(); - auto vals_u = csr_u->get_values(); - -#pragma omp parallel for - for (size_type row = 0; row < 
system_matrix->get_size()[0]; ++row) { - size_type current_index_l = row_ptrs_l[row]; - size_type current_index_u = row_ptrs_u[row]; - for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) { - const auto col = col_idxs[el]; - const auto val = vals[el]; - if (col < row) { - col_idxs_l[current_index_l] = col; - vals_l[current_index_l] = val; - ++current_index_l; - } else if (col == row) { - // Update both L and U - col_idxs_l[current_index_l] = col; - vals_l[current_index_l] = one(); - ++current_index_l; - - col_idxs_u[current_index_u] = col; - vals_u[current_index_u] = val; - ++current_index_u; - } else { // col > row - col_idxs_u[current_index_u] = col; - vals_u[current_index_u] = val; - ++current_index_u; - } - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILU_INITIALIZE_L_U_KERNEL); - - template void compute_l_u_factors(std::shared_ptr exec, size_type iterations, @@ -198,12 +105,12 @@ void compute_l_u_factors(std::shared_ptr exec, if (row > col) { // modify entry in L auto to_write = sum / vals_u[row_ptrs_u[col + 1] - 1]; - if (isfinite(to_write)) { + if (is_finite(to_write)) { vals_l[row_l - 1] = to_write; } } else { // modify entry in U auto to_write = sum; - if (isfinite(to_write)) { + if (is_finite(to_write)) { vals_u[row_u - 1] = to_write; } } diff --git a/omp/factorization/par_ilut_kernels.cpp b/omp/factorization/par_ilut_kernels.cpp new file mode 100644 index 00000000000..1e7d4988c5c --- /dev/null +++ b/omp/factorization/par_ilut_kernels.cpp @@ -0,0 +1,470 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include +#include +#include +#include + + +#include + + +#include +#include +#include +#include + + +#include "core/base/utils.hpp" +#include "core/components/prefix_sum.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "omp/components/csr_spgeam.hpp" + + +namespace gko { +namespace kernels { +namespace omp { +/** + * @brief The parallel ILUT factorization namespace. 
+ * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +template +void threshold_select(std::shared_ptr exec, + const matrix::Csr *m, + IndexType rank, Array &tmp, + Array> &, + remove_complex &threshold) +{ + auto values = m->get_const_values(); + IndexType size = m->get_num_stored_elements(); + tmp.resize_and_reset(size); + std::copy_n(values, size, tmp.get_data()); + + auto begin = tmp.get_data(); + auto target = begin + rank; + auto end = begin + size; + std::nth_element(begin, target, end, + [](ValueType a, ValueType b) { return abs(a) < abs(b); }); + threshold = abs(*target); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL); + + +/** + * Removes all the elements from the input matrix for which pred is false. + * Stores the result in m_out and (if non-null) m_out_coo. + * pred(row, nz) is called for each entry, where nz is the index in + * values/col_idxs. + */ +template +void abstract_filter(std::shared_ptr exec, + const matrix::Csr *m, + matrix::Csr *m_out, + matrix::Coo *m_out_coo, + Predicate pred) +{ + auto num_rows = m->get_size()[0]; + auto row_ptrs = m->get_const_row_ptrs(); + auto col_idxs = m->get_const_col_idxs(); + auto vals = m->get_const_values(); + + // first sweep: count nnz for each row + auto new_row_ptrs = m_out->get_row_ptrs(); + +#pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + IndexType count{}; + for (auto nz = row_ptrs[row]; nz < row_ptrs[row + 1]; ++nz) { + count += pred(row, nz); + } + new_row_ptrs[row] = count; + } + + // build row pointers + components::prefix_sum(exec, new_row_ptrs, num_rows + 1); + + // second sweep: accumulate non-zeros + auto new_nnz = new_row_ptrs[num_rows]; + // resize arrays and update aliases + matrix::CsrBuilder builder{m_out}; + builder.get_col_idx_array().resize_and_reset(new_nnz); + builder.get_value_array().resize_and_reset(new_nnz); + auto new_col_idxs = m_out->get_col_idxs(); + auto new_vals = 
m_out->get_values(); + IndexType *new_row_idxs{}; + if (m_out_coo) { + matrix::CooBuilder coo_builder{m_out_coo}; + coo_builder.get_row_idx_array().resize_and_reset(new_nnz); + coo_builder.get_col_idx_array() = + Array::view(exec, new_nnz, new_col_idxs); + coo_builder.get_value_array() = + Array::view(exec, new_nnz, new_vals); + new_row_idxs = m_out_coo->get_row_idxs(); + } + +#pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + auto new_nz = new_row_ptrs[row]; + auto begin = row_ptrs[row]; + auto end = row_ptrs[row + 1]; + for (auto nz = begin; nz < end; ++nz) { + if (pred(row, nz)) { + if (new_row_idxs) { + new_row_idxs[new_nz] = row; + } + new_col_idxs[new_nz] = col_idxs[nz]; + new_vals[new_nz] = vals[nz]; + ++new_nz; + } + } + } +} + + +template +void threshold_filter(std::shared_ptr exec, + const matrix::Csr *m, + remove_complex threshold, + matrix::Csr *m_out, + matrix::Coo *m_out_coo, bool) +{ + auto col_idxs = m->get_const_col_idxs(); + auto vals = m->get_const_values(); + abstract_filter( + exec, m, m_out, m_out_coo, [&](IndexType row, IndexType nz) { + return abs(vals[nz]) >= threshold || col_idxs[nz] == row; + }); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL); + + +constexpr auto bucket_count = 1 << sampleselect_searchtree_height; +constexpr auto sample_size = bucket_count * sampleselect_oversampling; + + +template +void threshold_filter_approx(std::shared_ptr exec, + const matrix::Csr *m, + IndexType rank, Array &tmp, + remove_complex &threshold, + matrix::Csr *m_out, + matrix::Coo *m_out_coo) +{ + auto vals = m->get_const_values(); + auto col_idxs = m->get_const_col_idxs(); + auto size = static_cast(m->get_num_stored_elements()); + using AbsType = remove_complex; + auto num_threads = omp_get_max_threads(); + auto storage_size = + ceildiv(sample_size * sizeof(AbsType) + + bucket_count * (num_threads + 1) * sizeof(IndexType), + sizeof(ValueType)); + 
tmp.resize_and_reset(storage_size); + // pick and sort sample + auto sample = reinterpret_cast(tmp.get_data()); + // assuming rounding towards zero + auto stride = double(size) / sample_size; + for (IndexType i = 0; i < sample_size; ++i) { + sample[i] = abs(vals[static_cast(i * stride)]); + } + std::sort(sample, sample + sample_size); + // pick splitters + for (IndexType i = 0; i < bucket_count - 1; ++i) { + // shift by one so we get upper bounds for the buckets + sample[i] = sample[(i + 1) * sampleselect_oversampling]; + } + // count elements per bucket + auto total_histogram = reinterpret_cast(sample + bucket_count); + for (IndexType bucket = 0; bucket < bucket_count; ++bucket) { + total_histogram[bucket] = 0; + } +#pragma omp parallel + { + auto local_histogram = + total_histogram + (omp_get_thread_num() + 1) * bucket_count; + for (IndexType bucket = 0; bucket < bucket_count; ++bucket) { + local_histogram[bucket] = 0; + } +#pragma omp for + for (IndexType nz = 0; nz < size; ++nz) { + auto bucket_it = std::upper_bound(sample, sample + bucket_count - 1, + abs(vals[nz])); + auto bucket = std::distance(sample, bucket_it); + // smallest bucket s.t. sample[bucket] >= abs(val[nz]) + local_histogram[bucket]++; + } + for (IndexType bucket = 0; bucket < bucket_count; ++bucket) { +#pragma omp atomic + total_histogram[bucket] += local_histogram[bucket]; + } + } + // determine splitter ranks: prefix sum over bucket counts + components::prefix_sum(exec, total_histogram, bucket_count + 1); + // determine the bucket containing the threshold rank: + // prefix_sum[bucket] <= rank < prefix_sum[bucket + 1] + auto it = std::upper_bound(total_histogram, + total_histogram + bucket_count + 1, rank); + auto threshold_bucket = std::distance(total_histogram + 1, it); + // sample contains upper bounds for the buckets + threshold = threshold_bucket > 0 ? 
sample[threshold_bucket - 1] + : zero>(); + // filter elements + abstract_filter( + exec, m, m_out, m_out_coo, [&](IndexType row, IndexType nz) { + auto bucket_it = std::upper_bound(sample, sample + bucket_count - 1, + abs(vals[nz])); + auto bucket = std::distance(sample, bucket_it); + return bucket >= threshold_bucket || col_idxs[nz] == row; + }); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL); + + +template +void compute_l_u_factors(std::shared_ptr exec, + const matrix::Csr *a, + matrix::Csr *l, + const matrix::Coo *, + matrix::Csr *u, + const matrix::Coo *, + matrix::Csr *u_csc) +{ + auto num_rows = a->get_size()[0]; + auto l_row_ptrs = l->get_const_row_ptrs(); + auto l_col_idxs = l->get_const_col_idxs(); + auto l_vals = l->get_values(); + auto u_row_ptrs = u->get_const_row_ptrs(); + auto u_col_idxs = u->get_const_col_idxs(); + auto u_vals = u->get_values(); + auto ut_col_ptrs = u_csc->get_const_row_ptrs(); + auto ut_row_idxs = u_csc->get_const_col_idxs(); + auto ut_vals = u_csc->get_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto a_vals = a->get_const_values(); + + auto compute_sum = [&](IndexType row, IndexType col) { + // find value from A + auto a_begin = a_row_ptrs[row]; + auto a_end = a_row_ptrs[row + 1]; + auto a_nz_it = + std::lower_bound(a_col_idxs + a_begin, a_col_idxs + a_end, col); + auto a_nz = std::distance(a_col_idxs, a_nz_it); + auto has_a = a_nz < a_end && a_col_idxs[a_nz] == col; + auto a_val = has_a ? 
a_vals[a_nz] : zero(); + // accumulate l(row,:) * u(:,col) without the last entry (row, col) + ValueType sum{}; + IndexType ut_nz{}; + auto l_begin = l_row_ptrs[row]; + auto l_end = l_row_ptrs[row + 1]; + auto u_begin = ut_col_ptrs[col]; + auto u_end = ut_col_ptrs[col + 1]; + auto last_entry = min(row, col); + while (l_begin < l_end && u_begin < u_end) { + auto l_col = l_col_idxs[l_begin]; + auto u_row = ut_row_idxs[u_begin]; + if (l_col == u_row && l_col < last_entry) { + sum += l_vals[l_begin] * ut_vals[u_begin]; + } + if (u_row == row) { + ut_nz = u_begin; + } + l_begin += (l_col <= u_row); + u_begin += (u_row <= l_col); + } + return std::make_pair(a_val - sum, ut_nz); + }; + +#pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + for (size_type l_nz = l_row_ptrs[row]; l_nz < l_row_ptrs[row + 1] - 1; + ++l_nz) { + auto col = l_col_idxs[l_nz]; + auto u_diag = ut_vals[ut_col_ptrs[col + 1] - 1]; + auto new_val = compute_sum(row, col).first / u_diag; + if (is_finite(new_val)) { + l_vals[l_nz] = new_val; + } + } + for (size_type u_nz = u_row_ptrs[row]; u_nz < u_row_ptrs[row + 1]; + ++u_nz) { + auto col = u_col_idxs[u_nz]; + auto result = compute_sum(row, col); + auto new_val = result.first; + auto ut_nz = result.second; + if (is_finite(new_val)) { + u_vals[u_nz] = new_val; + ut_vals[ut_nz] = new_val; + } + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL); + + +template +void add_candidates(std::shared_ptr exec, + const matrix::Csr *lu, + const matrix::Csr *a, + const matrix::Csr *l, + const matrix::Csr *u, + matrix::Csr *l_new, + matrix::Csr *u_new) +{ + auto num_rows = a->get_size()[0]; + auto l_row_ptrs = l->get_const_row_ptrs(); + auto l_col_idxs = l->get_const_col_idxs(); + auto l_vals = l->get_const_values(); + auto u_row_ptrs = u->get_const_row_ptrs(); + auto u_col_idxs = u->get_const_col_idxs(); + auto u_vals = u->get_const_values(); + auto l_new_row_ptrs = 
l_new->get_row_ptrs(); + auto u_new_row_ptrs = u_new->get_row_ptrs(); + constexpr auto sentinel = std::numeric_limits::max(); + // count nnz + abstract_spgeam( + a, lu, [](IndexType) { return std::pair{}; }, + [](IndexType row, IndexType col, ValueType, ValueType, + std::pair &nnzs) { + nnzs.first += col <= row; + nnzs.second += col >= row; + }, + [&](IndexType row, std::pair nnzs) { + l_new_row_ptrs[row] = nnzs.first; + u_new_row_ptrs[row] = nnzs.second; + }); + + components::prefix_sum(exec, l_new_row_ptrs, num_rows + 1); + components::prefix_sum(exec, u_new_row_ptrs, num_rows + 1); + + // resize arrays + auto l_nnz = l_new_row_ptrs[num_rows]; + auto u_nnz = u_new_row_ptrs[num_rows]; + matrix::CsrBuilder l_builder{l_new}; + matrix::CsrBuilder u_builder{u_new}; + l_builder.get_col_idx_array().resize_and_reset(l_nnz); + l_builder.get_value_array().resize_and_reset(l_nnz); + u_builder.get_col_idx_array().resize_and_reset(u_nnz); + u_builder.get_value_array().resize_and_reset(u_nnz); + auto l_new_col_idxs = l_new->get_col_idxs(); + auto l_new_vals = l_new->get_values(); + auto u_new_col_idxs = u_new->get_col_idxs(); + auto u_new_vals = u_new->get_values(); + + // accumulate non-zeros + struct row_state { + IndexType l_new_nz; + IndexType u_new_nz; + IndexType l_old_begin; + IndexType l_old_end; + IndexType u_old_begin; + IndexType u_old_end; + bool finished_l; + }; + abstract_spgeam( + a, lu, + [&](IndexType row) { + row_state state{}; + state.l_new_nz = l_new_row_ptrs[row]; + state.u_new_nz = u_new_row_ptrs[row]; + state.l_old_begin = l_row_ptrs[row]; + state.l_old_end = l_row_ptrs[row + 1] - 1; // skip diagonal + state.u_old_begin = u_row_ptrs[row]; + state.u_old_end = u_row_ptrs[row + 1]; + state.finished_l = (state.l_old_begin == state.l_old_end); + return state; + }, + [&](IndexType row, IndexType col, ValueType a_val, ValueType lu_val, + row_state &state) { + auto r_val = a_val - lu_val; + // load matching entry of L + U + auto lpu_col = state.finished_l + ? 
checked_load(u_col_idxs, state.u_old_begin, + state.u_old_end, sentinel) + : l_col_idxs[state.l_old_begin]; + auto lpu_val = + state.finished_l + ? checked_load(u_vals, state.u_old_begin, state.u_old_end, + zero()) + : l_vals[state.l_old_begin]; + // load diagonal entry of U for lower diagonal entries + auto diag = col < row ? u_vals[u_row_ptrs[col]] : one(); + // if there is already an entry present, use that instead. + auto out_val = lpu_col == col ? lpu_val : r_val / diag; + // store output entries + if (row >= col) { + l_new_col_idxs[state.l_new_nz] = col; + l_new_vals[state.l_new_nz] = + row == col ? one() : out_val; + state.l_new_nz++; + } + if (row <= col) { + u_new_col_idxs[state.u_new_nz] = col; + u_new_vals[state.u_new_nz] = out_val; + state.u_new_nz++; + } + // advance entry of L + U if we used it + if (state.finished_l) { + state.u_old_begin += (lpu_col == col); + } else { + state.l_old_begin += (lpu_col == col); + state.finished_l = (state.l_old_begin == state.l_old_end); + } + }, + [](IndexType, row_state) {}); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/omp/matrix/coo_kernels.cpp b/omp/matrix/coo_kernels.cpp index 33254cb0f44..71eb4c93a45 100644 --- a/omp/matrix/coo_kernels.cpp +++ b/omp/matrix/coo_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -156,8 +156,8 @@ void convert_row_idxs_to_ptrs(std::shared_ptr exec, template void convert_to_csr(std::shared_ptr exec, - matrix::Csr *result, - const matrix::Coo *source) + const matrix::Coo *source, + matrix::Csr *result) { auto num_rows = result->get_size()[0]; @@ -176,8 +176,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_dense(std::shared_ptr exec, - matrix::Dense *result, - const matrix::Coo *source) + const matrix::Coo *source, + matrix::Dense *result) { auto coo_val = source->get_const_values(); auto coo_col = source->get_const_col_idxs(); diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index 3b541cba94c..406ce327ff1 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,7 +34,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include #include @@ -42,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include @@ -49,7 +49,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/base/allocator.hpp" #include "core/base/iterator_factory.hpp" +#include "core/components/prefix_sum.hpp" +#include "core/matrix/csr_builder.hpp" +#include "omp/components/csr_spgeam.hpp" #include "omp/components/format_conversion.hpp" @@ -126,6 +130,238 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL); +template +void spgemm_insert_row(unordered_set &cols, + const matrix::Csr *c, + size_type row) +{ + auto row_ptrs = c->get_const_row_ptrs(); + auto col_idxs = c->get_const_col_idxs(); + cols.insert(col_idxs + row_ptrs[row], col_idxs + row_ptrs[row + 1]); +} + + +template +void spgemm_insert_row2(unordered_set &cols, + const matrix::Csr *a, + const matrix::Csr *b, + size_type row) +{ + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto b_row_ptrs = b->get_const_row_ptrs(); + auto b_col_idxs = b->get_const_col_idxs(); + for (size_type a_nz = a_row_ptrs[row]; + a_nz < size_type(a_row_ptrs[row + 1]); ++a_nz) { + auto a_col = a_col_idxs[a_nz]; + auto b_row = a_col; + cols.insert(b_col_idxs + b_row_ptrs[b_row], + b_col_idxs + b_row_ptrs[b_row + 1]); + } +} + + +template +void spgemm_accumulate_row(map &cols, + const matrix::Csr *c, + ValueType scale, size_type row) +{ + auto row_ptrs = c->get_const_row_ptrs(); + auto col_idxs = c->get_const_col_idxs(); + auto vals = c->get_const_values(); + for (size_type c_nz = row_ptrs[row]; c_nz < size_type(row_ptrs[row + 1]); + ++c_nz) { + auto c_col = col_idxs[c_nz]; + auto c_val = vals[c_nz]; + cols[c_col] += scale * c_val; + } +} + + +template +void spgemm_accumulate_row2(map &cols, + const matrix::Csr *a, + const matrix::Csr *b, + ValueType scale, size_type row) +{ + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto a_vals = a->get_const_values(); + auto b_row_ptrs = b->get_const_row_ptrs(); + auto b_col_idxs = b->get_const_col_idxs(); + auto b_vals = b->get_const_values(); + for 
(size_type a_nz = a_row_ptrs[row]; + a_nz < size_type(a_row_ptrs[row + 1]); ++a_nz) { + auto a_col = a_col_idxs[a_nz]; + auto a_val = a_vals[a_nz]; + auto b_row = a_col; + for (size_type b_nz = b_row_ptrs[b_row]; + b_nz < size_type(b_row_ptrs[b_row + 1]); ++b_nz) { + auto b_col = b_col_idxs[b_nz]; + auto b_val = b_vals[b_nz]; + cols[b_col] += scale * a_val * b_val; + } + } +} + + +template +void spgemm(std::shared_ptr exec, + const matrix::Csr *a, + const matrix::Csr *b, + matrix::Csr *c) +{ + auto num_rows = a->get_size()[0]; + + // first sweep: count nnz for each row + auto c_row_ptrs = c->get_row_ptrs(); + + unordered_set local_col_idxs(exec); +#pragma omp parallel for firstprivate(local_col_idxs) + for (size_type a_row = 0; a_row < num_rows; ++a_row) { + local_col_idxs.clear(); + spgemm_insert_row2(local_col_idxs, a, b, a_row); + c_row_ptrs[a_row] = local_col_idxs.size(); + } + + // build row pointers + components::prefix_sum(exec, c_row_ptrs, num_rows + 1); + + // second sweep: accumulate non-zeros + auto new_nnz = c_row_ptrs[num_rows]; + matrix::CsrBuilder c_builder{c}; + auto &c_col_idxs_array = c_builder.get_col_idx_array(); + auto &c_vals_array = c_builder.get_value_array(); + c_col_idxs_array.resize_and_reset(new_nnz); + c_vals_array.resize_and_reset(new_nnz); + auto c_col_idxs = c_col_idxs_array.get_data(); + auto c_vals = c_vals_array.get_data(); + + map local_row_nzs(exec); +#pragma omp parallel for firstprivate(local_row_nzs) + for (size_type a_row = 0; a_row < num_rows; ++a_row) { + local_row_nzs.clear(); + spgemm_accumulate_row2(local_row_nzs, a, b, one(), a_row); + // store result + auto c_nz = c_row_ptrs[a_row]; + for (auto pair : local_row_nzs) { + c_col_idxs[c_nz] = pair.first; + c_vals[c_nz] = pair.second; + ++c_nz; + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL); + + +template +void advanced_spgemm(std::shared_ptr exec, + const matrix::Dense *alpha, + const matrix::Csr *a, + const matrix::Csr *b, + 
const matrix::Dense *beta, + const matrix::Csr *d, + matrix::Csr *c) +{ + auto num_rows = a->get_size()[0]; + auto valpha = alpha->at(0, 0); + auto vbeta = beta->at(0, 0); + + // first sweep: count nnz for each row + auto c_row_ptrs = c->get_row_ptrs(); + + unordered_set local_col_idxs(exec); +#pragma omp parallel for firstprivate(local_col_idxs) + for (size_type a_row = 0; a_row < num_rows; ++a_row) { + local_col_idxs.clear(); + spgemm_insert_row(local_col_idxs, d, a_row); + spgemm_insert_row2(local_col_idxs, a, b, a_row); + c_row_ptrs[a_row] = local_col_idxs.size(); + } + + // build row pointers + components::prefix_sum(exec, c_row_ptrs, num_rows + 1); + + // second sweep: accumulate non-zeros + auto new_nnz = c_row_ptrs[num_rows]; + matrix::CsrBuilder c_builder{c}; + auto &c_col_idxs_array = c_builder.get_col_idx_array(); + auto &c_vals_array = c_builder.get_value_array(); + c_col_idxs_array.resize_and_reset(new_nnz); + c_vals_array.resize_and_reset(new_nnz); + auto c_col_idxs = c_col_idxs_array.get_data(); + auto c_vals = c_vals_array.get_data(); + + map local_row_nzs(exec); +#pragma omp parallel for firstprivate(local_row_nzs) + for (size_type a_row = 0; a_row < num_rows; ++a_row) { + local_row_nzs.clear(); + spgemm_accumulate_row(local_row_nzs, d, vbeta, a_row); + spgemm_accumulate_row2(local_row_nzs, a, b, valpha, a_row); + // store result + auto c_nz = c_row_ptrs[a_row]; + for (auto pair : local_row_nzs) { + c_col_idxs[c_nz] = pair.first; + c_vals[c_nz] = pair.second; + ++c_nz; + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL); + + +template +void spgeam(std::shared_ptr exec, + const matrix::Dense *alpha, + const matrix::Csr *a, + const matrix::Dense *beta, + const matrix::Csr *b, + matrix::Csr *c) +{ + auto num_rows = a->get_size()[0]; + auto valpha = alpha->at(0, 0); + auto vbeta = beta->at(0, 0); + + // first sweep: count nnz for each row + auto c_row_ptrs = c->get_row_ptrs(); + + abstract_spgeam( + a, 
b, [](IndexType) { return IndexType{}; }, + [](IndexType, IndexType, ValueType, ValueType, IndexType &nnz) { + ++nnz; + }, + [&](IndexType row, IndexType nnz) { c_row_ptrs[row] = nnz; }); + + // build row pointers + components::prefix_sum(exec, c_row_ptrs, num_rows + 1); + + // second sweep: accumulate non-zeros + auto new_nnz = c_row_ptrs[num_rows]; + matrix::CsrBuilder c_builder{c}; + auto &c_col_idxs_array = c_builder.get_col_idx_array(); + auto &c_vals_array = c_builder.get_value_array(); + c_col_idxs_array.resize_and_reset(new_nnz); + c_vals_array.resize_and_reset(new_nnz); + auto c_col_idxs = c_col_idxs_array.get_data(); + auto c_vals = c_vals_array.get_data(); + + abstract_spgeam( + a, b, [&](IndexType row) { return c_row_ptrs[row]; }, + [&](IndexType, IndexType col, ValueType a_val, ValueType b_val, + IndexType &nz) { + c_vals[nz] = valpha * a_val + vbeta * b_val; + c_col_idxs[nz] = col; + ++nz; + }, + [](IndexType, IndexType) {}); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); + + template void convert_row_ptrs_to_idxs(std::shared_ptr exec, const IndexType *ptrs, size_type num_rows, @@ -137,8 +373,8 @@ void convert_row_ptrs_to_idxs(std::shared_ptr exec, template void convert_to_coo(std::shared_ptr exec, - matrix::Coo *result, - const matrix::Csr *source) + const matrix::Csr *source, + matrix::Coo *result) { auto num_rows = result->get_size()[0]; @@ -154,8 +390,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_dense(std::shared_ptr exec, - matrix::Dense *result, - const matrix::Csr *source) + const matrix::Csr *source, + matrix::Dense *result) { auto num_rows = source->get_size()[0]; auto num_cols = source->get_size()[1]; @@ -163,12 +399,11 @@ void convert_to_dense(std::shared_ptr exec, auto col_idxs = source->get_const_col_idxs(); auto vals = source->get_const_values(); - for (size_type row = 0; row < num_rows; ++row) { #pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) 
{ for (size_type col = 0; col < num_cols; ++col) { result->at(row, col) = zero(); } -#pragma omp parallel for for (size_type i = row_ptrs[row]; i < static_cast(row_ptrs[row + 1]); ++i) { result->at(row, col_idxs[i]) = vals[i]; @@ -182,8 +417,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_sellp(std::shared_ptr exec, - matrix::Sellp *result, - const matrix::Csr *source) + const matrix::Csr *source, + matrix::Sellp *result) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -192,15 +427,15 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_ell(std::shared_ptr exec, - matrix::Ell *result, - const matrix::Csr *source) + const matrix::Csr *source, + matrix::Ell *result) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL); -template +template inline void convert_csr_to_csc(size_type num_rows, const IndexType *row_ptrs, const IndexType *col_idxs, const ValueType *csr_vals, IndexType *row_idxs, @@ -245,8 +480,8 @@ void transpose_and_transform(std::shared_ptr exec, template void transpose(std::shared_ptr exec, - matrix::Csr *trans, - const matrix::Csr *orig) + const matrix::Csr *orig, + matrix::Csr *trans) { transpose_and_transform(exec, trans, orig, [](const ValueType x) { return x; }); @@ -257,8 +492,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL); template void conj_transpose(std::shared_ptr exec, - matrix::Csr *trans, - const matrix::Csr *orig) + const matrix::Csr *orig, + matrix::Csr *trans) { transpose_and_transform(exec, trans, orig, [](const ValueType x) { return conj(x); }); @@ -289,8 +524,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_hybrid(std::shared_ptr exec, - matrix::Hybrid *result, - const matrix::Csr *source) + const matrix::Csr *source, + matrix::Hybrid *result) { auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; @@ -363,6 +598,144 @@ 
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL); +template +void row_permute_impl(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Csr *orig, + matrix::Csr *row_permuted) +{ + auto perm = permutation_indices->get_const_data(); + auto orig_row_ptrs = orig->get_const_row_ptrs(); + auto orig_col_idxs = orig->get_const_col_idxs(); + auto orig_vals = orig->get_const_values(); + auto rp_row_ptrs = row_permuted->get_row_ptrs(); + auto rp_col_idxs = row_permuted->get_col_idxs(); + auto rp_vals = row_permuted->get_values(); + size_type num_rows = orig->get_size()[0]; + size_type num_nnz = orig->get_num_stored_elements(); + + size_type cur_ptr = 0; + rp_row_ptrs[0] = cur_ptr; + vector orig_num_nnz_per_row(num_rows, 0, exec); +#pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + orig_num_nnz_per_row[row] = orig_row_ptrs[row + 1] - orig_row_ptrs[row]; + } + for (size_type row = 0; row < num_rows; ++row) { + rp_row_ptrs[row + 1] = + rp_row_ptrs[row] + orig_num_nnz_per_row[perm[row]]; + } + rp_row_ptrs[num_rows] = orig_row_ptrs[num_rows]; +#pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + auto new_row = perm[row]; + auto new_k = orig_row_ptrs[new_row]; + for (size_type k = rp_row_ptrs[row]; + k < size_type(rp_row_ptrs[row + 1]); ++k) { + rp_col_idxs[k] = orig_col_idxs[new_k]; + rp_vals[k] = orig_vals[new_k]; + new_k++; + } + } +} + + +template +void row_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Csr *orig, + matrix::Csr *row_permuted) +{ + row_permute_impl(exec, permutation_indices, orig, row_permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL); + + +template +void inverse_row_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Csr *orig, + matrix::Csr *row_permuted) +{ + auto perm = permutation_indices->get_const_data(); + Array 
inv_perm(*permutation_indices); + auto iperm = inv_perm.get_data(); +#pragma omp parallel for + for (size_type ind = 0; ind < inv_perm.get_num_elems(); ++ind) { + iperm[perm[ind]] = ind; + } + + row_permute_impl(exec, &inv_perm, orig, row_permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL); + + +template +void column_permute_impl(const Array *permutation_indices, + const matrix::Csr *orig, + matrix::Csr *column_permuted) +{ + auto perm = permutation_indices->get_const_data(); + auto orig_row_ptrs = orig->get_const_row_ptrs(); + auto orig_col_idxs = orig->get_const_col_idxs(); + auto orig_vals = orig->get_const_values(); + auto cp_row_ptrs = column_permuted->get_row_ptrs(); + auto cp_col_idxs = column_permuted->get_col_idxs(); + auto cp_vals = column_permuted->get_values(); + auto num_nnz = orig->get_num_stored_elements(); + size_type num_rows = orig->get_size()[0]; + size_type num_cols = orig->get_size()[1]; + +#pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + cp_row_ptrs[row] = orig_row_ptrs[row]; + for (size_type k = orig_row_ptrs[row]; + k < size_type(orig_row_ptrs[row + 1]); ++k) { + cp_col_idxs[k] = perm[orig_col_idxs[k]]; + cp_vals[k] = orig_vals[k]; + } + } + cp_row_ptrs[num_rows] = orig_row_ptrs[num_rows]; +} + + +template +void column_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Csr *orig, + matrix::Csr *column_permuted) +{ + auto perm = permutation_indices->get_const_data(); + Array inv_perm(*permutation_indices); + auto iperm = inv_perm.get_data(); +#pragma omp parallel for + for (size_type ind = 0; ind < inv_perm.get_num_elems(); ++ind) { + iperm[perm[ind]] = ind; + } + column_permute_impl(&inv_perm, orig, column_permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL); + + +template +void inverse_column_permute(std::shared_ptr exec, + const Array *permutation_indices, + const 
matrix::Csr *orig, + matrix::Csr *column_permuted) +{ + column_permute_impl(permutation_indices, orig, column_permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL); + + template void calculate_nonzeros_per_row(std::shared_ptr exec, const matrix::Csr *source, diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp index 5654b40d753..a74bae1bd0f 100644 --- a/omp/matrix/dense_kernels.cpp +++ b/omp/matrix/dense_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include @@ -49,6 +50,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/components/prefix_sum.hpp" + + namespace gko { namespace kernels { namespace omp { @@ -195,17 +199,23 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); template void compute_norm2(std::shared_ptr exec, const matrix::Dense *x, - matrix::Dense *result) + matrix::Dense> *result) { - compute_dot(exec, x, x, result); - const size_type dim_0 = result->get_size()[0]; - const size_type dim_1 = result->get_size()[1]; -#pragma omp parallel for collapse(2) - for (size_type i = 0; i < dim_0; ++i) { - for (size_type j = 0; j < dim_1; ++j) { - result->at(i, j) = sqrt(abs(result->at(i, j))); + using norm_type = remove_complex; +#pragma omp parallel for + for (size_type j = 0; j < x->get_size()[1]; ++j) { + result->at(0, j) = zero(); + } +#pragma omp parallel for + for (size_type j = 0; j < x->get_size()[1]; ++j) { + for (size_type i = 0; i < x->get_size()[0]; ++i) { + result->at(0, j) += squared_norm(x->at(i, j)); } } +#pragma omp parallel for + for (size_type j = 0; j < 
x->get_size()[1]; ++j) { + result->at(0, j) = sqrt(result->at(0, j)); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); @@ -213,8 +223,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); template void convert_to_coo(std::shared_ptr exec, - matrix::Coo *result, - const matrix::Dense *source) + const matrix::Dense *source, + matrix::Coo *result) { auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; @@ -223,20 +233,31 @@ void convert_to_coo(std::shared_ptr exec, auto row_idxs = result->get_row_idxs(); auto col_idxs = result->get_col_idxs(); auto values = result->get_values(); + Array row_ptrs_array(exec, num_rows); + auto row_ptrs = row_ptrs_array.get_data(); - size_type idxs = 0; +#pragma omp parallel for for (size_type row = 0; row < num_rows; ++row) { + IndexType row_count{}; + for (size_type col = 0; col < num_cols; ++col) { + auto val = source->at(row, col); + row_count += val != zero(); + } + row_ptrs[row] = row_count; + } + + components::prefix_sum(exec, row_ptrs, num_rows); + #pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + auto idxs = row_ptrs[row]; for (size_type col = 0; col < num_cols; ++col) { auto val = source->at(row, col); if (val != zero()) { -#pragma omp critical - { - row_idxs[idxs] = row; - col_idxs[idxs] = col; - values[idxs] = val; - ++idxs; - } + row_idxs[idxs] = row; + col_idxs[idxs] = col; + values[idxs] = val; + ++idxs; } } } @@ -248,8 +269,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_csr(std::shared_ptr exec, - matrix::Csr *result, - const matrix::Dense *source) + const matrix::Dense *source, + matrix::Csr *result) { auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; @@ -259,22 +280,29 @@ void convert_to_csr(std::shared_ptr exec, auto col_idxs = result->get_col_idxs(); auto values = result->get_values(); - size_type cur_ptr = 0; - row_ptrs[0] = cur_ptr; +#pragma omp 
parallel for for (size_type row = 0; row < num_rows; ++row) { + IndexType row_nnz{}; + for (size_type col = 0; col < num_cols; ++col) { + auto val = source->at(row, col); + row_nnz += val != zero(); + } + row_ptrs[row] = row_nnz; + } + + components::prefix_sum(exec, row_ptrs, num_rows + 1); + #pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + auto cur_ptr = row_ptrs[row]; for (size_type col = 0; col < num_cols; ++col) { auto val = source->at(row, col); if (val != zero()) { -#pragma omp critical - { - col_idxs[cur_ptr] = col; - values[cur_ptr] = val; - ++cur_ptr; - } + col_idxs[cur_ptr] = col; + values[cur_ptr] = val; + ++cur_ptr; } } - row_ptrs[row + 1] = cur_ptr; } } @@ -284,8 +312,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_ell(std::shared_ptr exec, - matrix::Ell *result, - const matrix::Dense *source) + const matrix::Dense *source, + matrix::Ell *result) { auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; @@ -317,22 +345,24 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_hybrid(std::shared_ptr exec, - matrix::Hybrid *result, - const matrix::Dense *source) + const matrix::Dense *source, + matrix::Hybrid *result) { auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; auto strategy = result->get_strategy(); auto ell_lim = strategy->get_ell_num_stored_elements_per_row(); - auto coo_lim = strategy->get_coo_nnz(); auto coo_val = result->get_coo_values(); auto coo_col = result->get_coo_col_idxs(); auto coo_row = result->get_coo_row_idxs(); + Array coo_row_ptrs_array(exec, num_rows); + auto coo_row_ptrs = coo_row_ptrs_array.get_data(); -#pragma omp parallel for - for (size_type i = 0; i < result->get_ell_num_stored_elements_per_row(); - i++) { - for (size_type j = 0; j < result->get_ell_stride(); j++) { + auto ell_nnz_row = result->get_ell_num_stored_elements_per_row(); + auto ell_stride = result->get_ell_stride(); +#pragma omp 
parallel for collapse(2) + for (size_type i = 0; i < ell_nnz_row; i++) { + for (size_type j = 0; j < ell_stride; j++) { result->ell_val_at(j, i) = zero(); result->ell_col_at(j, i) = 0; } @@ -343,39 +373,39 @@ void convert_to_hybrid(std::shared_ptr exec, coo_col[i] = 0; coo_row[i] = 0; } +#pragma omp parallel for + for (size_type row = 0; row < num_rows; row++) { + size_type total_row_nnz{}; + for (size_type col = 0; col < num_cols; col++) { + auto val = source->at(row, col); + total_row_nnz += val != zero(); + } + coo_row_ptrs[row] = std::max(ell_lim, total_row_nnz) - ell_lim; + } + + components::prefix_sum(exec, coo_row_ptrs, num_rows); - size_type coo_idx = 0; - // FIXME: This parallelization may cause the COO part to not being sorted by - // row idx #pragma omp parallel for for (size_type row = 0; row < num_rows; row++) { - size_type col_idx = 0; + size_type ell_count = 0; size_type col = 0; - while (col < num_cols && col_idx < ell_lim) { + for (; col < num_cols && ell_count < ell_lim; col++) { auto val = source->at(row, col); if (val != zero()) { - result->ell_val_at(row, col_idx) = val; - result->ell_col_at(row, col_idx) = col; - col_idx++; + result->ell_val_at(row, ell_count) = val; + result->ell_col_at(row, ell_count) = col; + ell_count++; } - col++; } - while (col < num_cols) { + auto coo_idx = coo_row_ptrs[row]; + for (; col < num_cols; col++) { auto val = source->at(row, col); if (val != zero()) { - size_type current_coo_idx; - // Use the critical section for accessing the coo_idx only, the - // rest can be performed in parallel since the index is unique -#pragma omp critical - { - current_coo_idx = coo_idx; - ++coo_idx; - } - coo_val[current_coo_idx] = val; - coo_col[current_coo_idx] = col; - coo_row[current_coo_idx] = row; + coo_val[coo_idx] = val; + coo_col[coo_idx] = col; + coo_row[coo_idx] = row; + coo_idx++; } - col++; } } } @@ -386,8 +416,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_sellp(std::shared_ptr exec, - 
matrix::Sellp *result, - const matrix::Dense *source) + const matrix::Dense *source, + matrix::Sellp *result) { auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; @@ -448,8 +478,11 @@ void convert_to_sellp(std::shared_ptr exec, } } } - slice_sets[slice_num] = - slice_sets[slice_num - 1] + slice_lengths[slice_num - 1]; + + if (slice_num > 0) { + slice_sets[slice_num] = + slice_sets[slice_num - 1] + slice_lengths[slice_num - 1]; + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -458,8 +491,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_sparsity_csr(std::shared_ptr exec, - matrix::SparsityCsr *result, - const matrix::Dense *source) + const matrix::Dense *source, + matrix::SparsityCsr *result) { auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; @@ -469,21 +502,28 @@ void convert_to_sparsity_csr(std::shared_ptr exec, auto value = result->get_value(); value[0] = one(); - size_type cur_ptr = 0; - row_ptrs[0] = cur_ptr; +#pragma omp parallel for for (size_type row = 0; row < num_rows; ++row) { + IndexType row_nnz{}; + for (size_type col = 0; col < num_cols; ++col) { + auto val = source->at(row, col); + row_nnz += val != zero(); + } + row_ptrs[row] = row_nnz; + } + + components::prefix_sum(exec, row_ptrs, num_rows + 1); + #pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + auto cur_ptr = row_ptrs[row]; for (size_type col = 0; col < num_cols; ++col) { auto val = source->at(row, col); if (val != zero()) { -#pragma omp critical - { - col_idxs[cur_ptr] = col; - ++cur_ptr; - } + col_idxs[cur_ptr] = col; + ++cur_ptr; } } - row_ptrs[row + 1] = cur_ptr; } } @@ -592,8 +632,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void transpose(std::shared_ptr exec, - matrix::Dense *trans, - const matrix::Dense *orig) + const matrix::Dense *orig, + matrix::Dense *trans) { #pragma omp parallel for for (size_type i = 0; i < orig->get_size()[0]; ++i) { @@ -608,8 +648,8 @@ 
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TRANSPOSE_KERNEL); template void conj_transpose(std::shared_ptr exec, - matrix::Dense *trans, - const matrix::Dense *orig) + const matrix::Dense *orig, + matrix::Dense *trans) { #pragma omp parallel for for (size_type i = 0; i < orig->get_size()[0]; ++i) { @@ -622,6 +662,81 @@ void conj_transpose(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CONJ_TRANSPOSE_KERNEL); +template +void row_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Dense *orig, + matrix::Dense *row_permuted) +{ + auto perm = permutation_indices->get_const_data(); +#pragma omp parallel for + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + row_permuted->at(i, j) = orig->at(perm[i], j); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ROW_PERMUTE_KERNEL); + + +template +void column_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Dense *orig, + matrix::Dense *column_permuted) +{ + auto perm = permutation_indices->get_const_data(); +#pragma omp parallel for + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + column_permuted->at(i, j) = orig->at(i, perm[j]); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_COLUMN_PERMUTE_KERNEL); + + +template +void inverse_row_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Dense *orig, + matrix::Dense *row_permuted) +{ + auto perm = permutation_indices->get_const_data(); +#pragma omp parallel for + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + row_permuted->at(perm[i], j) = orig->at(i, j); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_INVERSE_ROW_PERMUTE_KERNEL); + + +template +void inverse_column_permute(std::shared_ptr exec, + 
const Array *permutation_indices, + const matrix::Dense *orig, + matrix::Dense *column_permuted) +{ + auto perm = permutation_indices->get_const_data(); +#pragma omp parallel for + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + column_permuted->at(i, perm[j]) = orig->at(i, j); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_INVERSE_COLUMN_PERMUTE_KERNEL); + + } // namespace dense } // namespace omp } // namespace kernels diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp index a487890be07..57bced52e9d 100644 --- a/omp/matrix/ell_kernels.cpp +++ b/omp/matrix/ell_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -114,8 +114,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_dense(std::shared_ptr exec, - matrix::Dense *result, - const matrix::Ell *source) + const matrix::Ell *source, + matrix::Dense *result) { auto num_rows = source->get_size()[0]; auto num_cols = source->get_size()[1]; @@ -139,8 +139,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_csr(std::shared_ptr exec, - matrix::Csr *result, - const matrix::Ell *source) + const matrix::Ell *source, + matrix::Csr *result) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -156,8 +156,8 @@ void count_nonzeros(std::shared_ptr exec, const auto max_nnz_per_row = source->get_num_stored_elements_per_row(); const auto stride = source->get_stride(); - for (size_type row = 0; row < num_rows; row++) { #pragma omp parallel for reduction(+ : nonzeros) + for (size_type row = 0; row < num_rows; row++) { for (size_type i = 0; i < max_nnz_per_row; i++) { nonzeros += (source->val_at(row, i) != zero()); } diff --git 
a/omp/matrix/hybrid_kernels.cpp b/omp/matrix/hybrid_kernels.cpp index 5453c94ac57..8282d7c7ab8 100644 --- a/omp/matrix/hybrid_kernels.cpp +++ b/omp/matrix/hybrid_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -59,8 +59,8 @@ namespace hybrid { template void convert_to_dense(std::shared_ptr exec, - matrix::Dense *result, - const matrix::Hybrid *source) + const matrix::Hybrid *source, + matrix::Dense *result) { auto num_rows = source->get_size()[0]; auto num_cols = source->get_size()[1]; @@ -99,8 +99,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_csr(std::shared_ptr exec, - matrix::Csr *result, - const matrix::Hybrid *source) + const matrix::Hybrid *source, + matrix::Csr *result) { auto csr_val = result->get_values(); auto csr_col_idxs = result->get_col_idxs(); diff --git a/omp/matrix/sellp_kernels.cpp b/omp/matrix/sellp_kernels.cpp index 6212e1ac7b0..023dd7f5249 100644 --- a/omp/matrix/sellp_kernels.cpp +++ b/omp/matrix/sellp_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -125,8 +125,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_dense(std::shared_ptr exec, - matrix::Dense *result, - const matrix::Sellp *source) + const matrix::Sellp *source, + matrix::Dense *result) { auto num_rows = source->get_size()[0]; auto num_cols = source->get_size()[1]; @@ -161,8 +161,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_csr(std::shared_ptr exec, - matrix::Csr *result, - const matrix::Sellp *source) + const matrix::Sellp *source, + matrix::Csr *result) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/omp/matrix/sparsity_csr_kernels.cpp b/omp/matrix/sparsity_csr_kernels.cpp index 830435fc376..22987b55287 100644 --- a/omp/matrix/sparsity_csr_kernels.cpp +++ b/omp/matrix/sparsity_csr_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -148,9 +148,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void remove_diagonal_elements(std::shared_ptr exec, - matrix::SparsityCsr *matrix, const IndexType *row_ptrs, - const IndexType *col_idxs) + const IndexType *col_idxs, + matrix::SparsityCsr *matrix) { auto num_rows = matrix->get_size()[0]; auto adj_ptrs = matrix->get_row_ptrs(); @@ -221,8 +221,8 @@ void transpose_and_transform( template void transpose(std::shared_ptr exec, - matrix::SparsityCsr *trans, - const matrix::SparsityCsr *orig) + const matrix::SparsityCsr *orig, + matrix::SparsityCsr *trans) { transpose_and_transform(exec, trans, orig); } diff --git a/omp/preconditioner/isai_kernels.cpp b/omp/preconditioner/isai_kernels.cpp new file mode 100644 index 00000000000..98acbf68446 --- /dev/null +++ b/omp/preconditioner/isai_kernels.cpp @@ -0,0 +1,332 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/preconditioner/isai_kernels.hpp" + + +#include +#include + + +#include + + +#include +#include +#include +#include + + +#include "core/components/prefix_sum.hpp" +#include "core/matrix/csr_builder.hpp" + + +namespace gko { +namespace kernels { +namespace omp { +/** + * @brief The Isai preconditioner namespace. + * + * @ingroup isai + */ +namespace isai { + + +template +void forall_matching(const IndexType *fst, IndexType fst_size, + const IndexType *snd, IndexType snd_size, Callback cb) +{ + IndexType fst_idx{}; + IndexType snd_idx{}; + while (fst_idx < fst_size && snd_idx < snd_size) { + const auto fst_val = fst[fst_idx]; + const auto snd_val = snd[snd_idx]; + if (fst_val == snd_val) { + cb(fst_val, fst_idx, snd_idx); + } + // advance the smaller entrie(s) + fst_idx += (fst_val <= snd_val); + snd_idx += (fst_val >= snd_val); + } +} + + +template +void generic_generate(std::shared_ptr exec, + const matrix::Csr *mtx, + matrix::Csr *inverse_mtx, + IndexType *excess_rhs_ptrs, IndexType *excess_nz_ptrs, + Callable trs_solve) +{ + /* + Consider: aiM := inverse_mtx; M := mtx + I := Identity matrix + e(i) := unit vector i (containing all zeros except for row i, which is one) + S := Sparsity pattern of the desired aiM + S(i) := Sparsity pattern of row i of aiM (Set of non-zero columns) + D(i) := M[S(i), S(i)] + aiM := approximate inverse of M + + Target: Solving (aiM * M = I)_{S} 
(aiM * M = I for the sparsity pattern S) + aiM[i, :] * D(i) = e(i)^T + <=> D(i)^T * aiM[i, :]^T = e(i) =^ Triangular system (Trs) + Solve Trs, fill in aiM row by row (coalesced access) + */ + const auto num_rows = mtx->get_size()[0]; + const auto m_row_ptrs = mtx->get_const_row_ptrs(); + const auto m_cols = mtx->get_const_col_idxs(); + const auto m_vals = mtx->get_const_values(); + const auto i_row_ptrs = inverse_mtx->get_const_row_ptrs(); + const auto i_cols = inverse_mtx->get_const_col_idxs(); + auto i_vals = inverse_mtx->get_values(); + + auto num_threads = static_cast(omp_get_max_threads()); + // RHS for local trisystem + gko::Array rhs_array{exec, row_size_limit * num_threads}; + // memory for dense trisystem + gko::Array trisystem_array{ + exec, row_size_limit * row_size_limit * num_threads}; + +#pragma omp parallel + { + auto thread_num = static_cast(omp_get_thread_num()); + + auto rhs = rhs_array.get_data() + thread_num * row_size_limit; + auto trisystem_ptr = trisystem_array.get_data() + + thread_num * row_size_limit * row_size_limit; + +#pragma omp for + for (size_type row = 0; row < num_rows; ++row) { + const auto i_begin = i_row_ptrs[row]; + const auto i_size = i_row_ptrs[row + 1] - i_begin; + + if (i_size <= row_size_limit) { + // short rows: treat directly as dense system + excess_rhs_ptrs[row] = 0; + excess_nz_ptrs[row] = 0; + auto trisystem = range>( + trisystem_ptr, static_cast(i_size), + static_cast(i_size), + static_cast(i_size)); + std::fill_n(trisystem_ptr, i_size * i_size, zero()); + + for (size_type i = 0; i < i_size; ++i) { + const auto col = i_cols[i_begin + i]; + const auto m_begin = m_row_ptrs[col]; + const auto m_size = m_row_ptrs[col + 1] - m_begin; + forall_matching( + m_cols + m_begin, m_size, i_cols + i_begin, i_size, + [&](IndexType, IndexType m_idx, IndexType i_idx) { + trisystem(i, i_idx) = m_vals[m_idx + m_begin]; + }); + } + + // solve dense triangular system + trs_solve(trisystem, rhs); + + // write triangular solution to 
inverse + for (size_type i = 0; i < i_size; ++i) { + const auto new_val = rhs[i]; + const auto idx = i_begin + i; + // check for non-finite elements which should not be copied + // over + if (is_finite(new_val)) { + i_vals[idx] = new_val; + } else { + // ensure the preconditioner does not prevent + // convergence + i_vals[idx] = i_cols[idx] == row ? one() + : zero(); + } + } + } else { + // count non-zeros and dimension in the excess system + IndexType count{}; + for (size_type i = 0; i < i_size; ++i) { + const auto col = i_cols[i_begin + i]; + const auto m_begin = m_row_ptrs[col]; + const auto m_size = m_row_ptrs[col + 1] - m_begin; + forall_matching( + m_cols + m_begin, m_size, i_cols + i_begin, i_size, + [&](IndexType, IndexType, IndexType) { ++count; }); + } + excess_rhs_ptrs[row] = i_size; + excess_nz_ptrs[row] = count; + } + } + } + components::prefix_sum(exec, excess_rhs_ptrs, num_rows + 1); + components::prefix_sum(exec, excess_nz_ptrs, num_rows + 1); +} + + +template +void generate_tri_inverse(std::shared_ptr exec, + const matrix::Csr *mtx, + matrix::Csr *inverse_mtx, + IndexType *excess_rhs_ptrs, IndexType *excess_nz_ptrs, + bool lower) +{ + auto trs_solve = + [lower](const range> trisystem, + ValueType *rhs) { + const IndexType size = trisystem.length(0); + if (size <= 0) { + return; + } + // RHS is the identity: zero everywhere except for the diagonal + // entry + std::fill_n(rhs, size, zero()); + rhs[lower ? 
size - 1 : 0] = one(); + + // solve transposed triangular system + if (lower) { + for (auto col = size - 1; col >= 0; --col) { + const auto diag = trisystem(col, col); + const auto bot = rhs[col] / diag; + rhs[col] = bot; + // do a backwards substitution + for (auto row = col - 1; row >= 0; --row) { + rhs[row] -= bot * trisystem(col, row); + } + } + } else { + for (IndexType col = 0; col < size; ++col) { + const auto diag = trisystem(col, col); + const auto top = rhs[col] / diag; + rhs[col] = top; + // do a forward substitution + for (auto row = col + 1; row < size; ++row) { + rhs[row] -= top * trisystem(col, row); + } + } + } + }; + + generic_generate(exec, mtx, inverse_mtx, excess_rhs_ptrs, excess_nz_ptrs, + trs_solve); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL); + + +template +void generate_excess_system(std::shared_ptr, + const matrix::Csr *input, + const matrix::Csr *inverse, + const IndexType *excess_rhs_ptrs, + const IndexType *excess_nz_ptrs, + matrix::Csr *excess_system, + matrix::Dense *excess_rhs) +{ + const auto num_rows = input->get_size()[0]; + const auto m_row_ptrs = input->get_const_row_ptrs(); + const auto m_cols = input->get_const_col_idxs(); + const auto m_vals = input->get_const_values(); + const auto i_row_ptrs = inverse->get_const_row_ptrs(); + const auto i_cols = inverse->get_const_col_idxs(); + const auto e_dim = excess_rhs->get_size()[0]; + auto e_row_ptrs = excess_system->get_row_ptrs(); + auto e_cols = excess_system->get_col_idxs(); + auto e_vals = excess_system->get_values(); + auto e_rhs = excess_rhs->get_values(); + +#pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + const auto i_begin = i_row_ptrs[row]; + const auto i_size = i_row_ptrs[row + 1] - i_begin; + // first row index of the sparse block in the excess system + auto e_begin = excess_rhs_ptrs[row]; + // first non-zero index in the sparse block + auto e_nz = excess_nz_ptrs[row]; + + if (i_size > 
row_size_limit) { + // count non-zeros and dimension in the excess system + for (size_type i = 0; i < i_size; ++i) { + // current row in the excess system + const auto e_row = e_begin + i; + const auto col = i_cols[i_begin + i]; + const auto m_begin = m_row_ptrs[col]; + const auto m_size = m_row_ptrs[col + 1] - m_begin; + // store row pointers: one row per non-zero of inverse row + e_row_ptrs[e_row] = e_nz; + // build right-hand side: identity row + e_rhs[e_row] = + row == col ? one() : zero(); + // build sparse block + forall_matching( + m_cols + m_begin, m_size, i_cols + i_begin, i_size, + [&](IndexType, IndexType m_idx, IndexType i_idx) { + // trisystem(i, i_idx) = m_vals[m_idx + m_begin] + // just in sparse + e_cols[e_nz] = i_idx + e_begin; + e_vals[e_nz] = m_vals[m_idx + m_begin]; + ++e_nz; + }); + } + } + } + e_row_ptrs[e_dim] = excess_nz_ptrs[num_rows]; +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL); + + +template +void scatter_excess_solution(std::shared_ptr, + const IndexType *excess_block_ptrs, + const matrix::Dense *excess_solution, + matrix::Csr *inverse) +{ + const auto num_rows = inverse->get_size()[0]; + auto excess_values = excess_solution->get_const_values(); + auto values = inverse->get_values(); + auto row_ptrs = inverse->get_const_row_ptrs(); +#pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + const auto excess_begin = excess_values + excess_block_ptrs[row]; + const auto excess_end = excess_values + excess_block_ptrs[row + 1]; + auto values_begin = values + row_ptrs[row]; + std::copy(excess_begin, excess_end, values_begin); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL); + + +} // namespace isai +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/omp/preconditioner/jacobi_kernels.cpp b/omp/preconditioner/jacobi_kernels.cpp index 1d04410a47b..cf3e6fb4ec8 100644 --- 
a/omp/preconditioner/jacobi_kernels.cpp +++ b/omp/preconditioner/jacobi_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include @@ -48,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/allocator.hpp" #include "core/base/extended_float.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "omp/components/matrix_operations.hpp" @@ -83,15 +85,9 @@ inline bool has_same_nonzero_pattern(const IndexType *prev_row_ptr, const IndexType *curr_row_ptr, const IndexType *next_row_ptr) { - if (next_row_ptr - curr_row_ptr != curr_row_ptr - prev_row_ptr) { - return false; - } - for (; curr_row_ptr < next_row_ptr; ++prev_row_ptr, ++curr_row_ptr) { - if (*curr_row_ptr != *prev_row_ptr) { - return false; - } - } - return true; + return std::distance(curr_row_ptr, next_row_ptr) == + std::distance(prev_row_ptr, curr_row_ptr) && + std::equal(curr_row_ptr, next_row_ptr, prev_row_ptr); } @@ -270,6 +266,24 @@ inline void transpose_block(IndexType block_size, const SourceValueType *from, } +template > +inline void conj_transpose_block(IndexType block_size, + const SourceValueType *from, + size_type from_stride, ResultValueType *to, + size_type to_stride, + ValueConverter converter = {}) noexcept +{ + for (IndexType i = 0; i < block_size; ++i) { + for (IndexType j = 0; j < block_size; ++j) { + to[i * to_stride + j] = conj(converter(from[i + j * from_stride])); + } + } +} + + template -inline bool validate_precision_reduction_feasibility(IndexType block_size, - const ValueType *block, - size_type stride) +inline bool validate_precision_reduction_feasibility( + std::shared_ptr exec, 
IndexType block_size, + const ValueType *block, size_type stride) { using gko::detail::float_traits; - std::vector tmp(block_size * block_size); - std::vector perm(block_size); + vector tmp(block_size * block_size, {}, exec); + vector perm(block_size, {}, exec); std::iota(begin(perm), end(perm), IndexType{0}); for (IndexType i = 0; i < block_size; ++i) { for (IndexType j = 0; j < block_size; ++j) { @@ -359,9 +373,9 @@ void generate(std::shared_ptr exec, const auto cond = conditioning.get_data(); #pragma omp parallel for for (size_type g = 0; g < num_blocks; g += group_size) { - std::vector> block(group_size); - std::vector> perm(group_size); - std::vector pr_descriptors(group_size, uint32{} - 1); + vector> block(group_size, {}, exec); + vector> perm(group_size, {}, exec); + vector pr_descriptors(group_size, uint32{} - 1, exec); // extract group of blocks, invert them, figure out storage precision for (size_type b = 0; b < group_size; ++b) { if (b + g >= num_blocks) { @@ -391,16 +405,18 @@ void generate(std::shared_ptr exec, using preconditioner::detail::get_supported_storage_reductions; pr_descriptors[b] = get_supported_storage_reductions( accuracy, cond[g + b], - [&block_size, &block, &b] { + [&exec, &block_size, &block, &b] { using target = reduce_precision; return validate_precision_reduction_feasibility( - block_size, block[b].get_const_data(), block_size); + exec, block_size, block[b].get_const_data(), + block_size); }, - [&block_size, &block, &b] { + [&exec, &block_size, &block, &b] { using target = reduce_precision>; return validate_precision_reduction_feasibility( - block_size, block[b].get_const_data(), block_size); + exec, block_size, block[b].get_const_data(), + block_size); }); } else { pr_descriptors[b] = preconditioner::detail:: @@ -553,6 +569,80 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL); +template +void transpose_jacobi( + std::shared_ptr exec, size_type num_blocks, + uint32 max_block_size, const Array 
&block_precisions, + const Array &block_pointers, const Array &blocks, + const preconditioner::block_interleaved_storage_scheme + &storage_scheme, + Array &out_blocks) +{ + const auto ptrs = block_pointers.get_const_data(); + const auto prec = block_precisions.get_const_data(); + const size_type matrix_size = ptrs[num_blocks]; + +#pragma omp parallel for + for (size_type i = 0; i < num_blocks; ++i) { + const auto group_ofs = storage_scheme.get_group_offset(i); + const auto block_ofs = storage_scheme.get_block_offset(i); + const auto block_stride = storage_scheme.get_stride(); + const auto group = blocks.get_const_data() + group_ofs; + auto out_group = out_blocks.get_data() + group_ofs; + const auto block_size = ptrs[i + 1] - ptrs[i]; + const auto p = prec ? prec[i] : precision_reduction(); + GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( + ValueType, p, + transpose_block( + block_size, + reinterpret_cast(group) + block_ofs, + block_stride, + reinterpret_cast(out_group) + block_ofs, + block_stride)); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL); + + +template +void conj_transpose_jacobi( + std::shared_ptr exec, size_type num_blocks, + uint32 max_block_size, const Array &block_precisions, + const Array &block_pointers, const Array &blocks, + const preconditioner::block_interleaved_storage_scheme + &storage_scheme, + Array &out_blocks) +{ + const auto ptrs = block_pointers.get_const_data(); + const auto prec = block_precisions.get_const_data(); + const size_type matrix_size = ptrs[num_blocks]; + +#pragma omp parallel for + for (size_type i = 0; i < num_blocks; ++i) { + const auto group_ofs = storage_scheme.get_group_offset(i); + const auto block_ofs = storage_scheme.get_block_offset(i); + const auto block_stride = storage_scheme.get_stride(); + const auto group = blocks.get_const_data() + group_ofs; + auto out_group = out_blocks.get_data() + group_ofs; + const auto block_size = ptrs[i + 1] - ptrs[i]; + const auto p = 
prec ? prec[i] : precision_reduction(); + GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( + ValueType, p, + conj_transpose_block( + block_size, + reinterpret_cast(group) + block_ofs, + block_stride, + reinterpret_cast(out_group) + block_ofs, + block_stride)); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL); + + template void convert_to_dense( std::shared_ptr exec, size_type num_blocks, diff --git a/omp/solver/bicg_kernels.cpp b/omp/solver/bicg_kernels.cpp new file mode 100644 index 00000000000..d9e2864eedf --- /dev/null +++ b/omp/solver/bicg_kernels.cpp @@ -0,0 +1,147 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/solver/bicg_kernels.hpp" + + +#include + + +#include +#include +#include +#include + + +namespace gko { +namespace kernels { +namespace omp { +/** + * @brief The BICG solver namespace. + * + * @ingroup bicg + */ +namespace bicg { + + +template +void initialize(std::shared_ptr exec, + const matrix::Dense *b, matrix::Dense *r, + matrix::Dense *z, matrix::Dense *p, + matrix::Dense *q, matrix::Dense *prev_rho, + matrix::Dense *rho, matrix::Dense *r2, + matrix::Dense *z2, matrix::Dense *p2, + matrix::Dense *q2, + Array *stop_status) +{ +#pragma omp parallel for + for (size_type j = 0; j < b->get_size()[1]; ++j) { + rho->at(j) = zero(); + prev_rho->at(j) = one(); + stop_status->get_data()[j].reset(); + } +#pragma omp parallel for + for (size_type i = 0; i < b->get_size()[0]; ++i) { + for (size_type j = 0; j < b->get_size()[1]; ++j) { + r->at(i, j) = b->at(i, j); + r2->at(i, j) = b->at(i, j); + z->at(i, j) = p->at(i, j) = q->at(i, j) = zero(); + z2->at(i, j) = p2->at(i, j) = q2->at(i, j) = zero(); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_INITIALIZE_KERNEL); + + +template +void step_1(std::shared_ptr exec, + matrix::Dense *p, const matrix::Dense *z, + matrix::Dense *p2, const matrix::Dense *z2, + const matrix::Dense *rho, + const matrix::Dense *prev_rho, + const Array *stop_status) +{ +#pragma omp parallel for + for (size_type i = 0; i < p->get_size()[0]; 
++i) { + for (size_type j = 0; j < p->get_size()[1]; ++j) { + if (stop_status->get_const_data()[j].has_stopped()) { + continue; + } + if (prev_rho->at(j) == zero()) { + p->at(i, j) = z->at(i, j); + p2->at(i, j) = z2->at(i, j); + } else { + auto tmp = rho->at(j) / prev_rho->at(j); + p->at(i, j) = z->at(i, j) + tmp * p->at(i, j); + p2->at(i, j) = z2->at(i, j) + tmp * p2->at(i, j); + } + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_1_KERNEL); + + +template +void step_2(std::shared_ptr exec, + matrix::Dense *x, matrix::Dense *r, + matrix::Dense *r2, const matrix::Dense *p, + const matrix::Dense *q, + const matrix::Dense *q2, + const matrix::Dense *beta, + const matrix::Dense *rho, + const Array *stop_status) +{ +#pragma omp parallel for + for (size_type i = 0; i < x->get_size()[0]; ++i) { + for (size_type j = 0; j < x->get_size()[1]; ++j) { + if (stop_status->get_const_data()[j].has_stopped()) { + continue; + } + if (beta->at(j) != zero()) { + auto tmp = rho->at(j) / beta->at(j); + x->at(i, j) += tmp * p->at(i, j); + r->at(i, j) -= tmp * q->at(i, j); + r2->at(i, j) -= tmp * q2->at(i, j); + } + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_2_KERNEL); + + +} // namespace bicg +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/omp/solver/bicgstab_kernels.cpp b/omp/solver/bicgstab_kernels.cpp index 8f4149a73f4..d761fc044cf 100644 --- a/omp/solver/bicgstab_kernels.cpp +++ b/omp/solver/bicgstab_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,10 +33,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/solver/bicgstab_kernels.hpp" -#include +#include -#include +#include #include @@ -135,6 +135,17 @@ void step_2(std::shared_ptr exec, const matrix::Dense *beta, const Array *stop_status) { +#pragma omp parallel for + for (size_type j = 0; j < s->get_size()[1]; ++j) { + if (stop_status->get_const_data()[j].has_stopped()) { + continue; + } + if (beta->at(j) != zero()) { + alpha->at(j) = rho->at(j) / beta->at(j); + } else { + alpha->at(j) = zero(); + } + } #pragma omp parallel for for (size_type i = 0; i < s->get_size()[0]; ++i) { for (size_type j = 0; j < s->get_size()[1]; ++j) { @@ -142,10 +153,8 @@ void step_2(std::shared_ptr exec, continue; } if (beta->at(j) != zero()) { - alpha->at(j) = rho->at(j) / beta->at(j); s->at(i, j) = r->at(i, j) - alpha->at(j) * v->at(i, j); } else { - alpha->at(j) = zero(); s->at(i, j) = r->at(i, j); } } diff --git a/omp/solver/cg_kernels.cpp b/omp/solver/cg_kernels.cpp index 07590c0b6f0..b9a88f25761 100644 --- a/omp/solver/cg_kernels.cpp +++ b/omp/solver/cg_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/omp/solver/cgs_kernels.cpp b/omp/solver/cgs_kernels.cpp index 5e9995faae1..a0678788565 100644 --- a/omp/solver/cgs_kernels.cpp +++ b/omp/solver/cgs_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -86,7 +86,6 @@ void initialize(std::shared_ptr exec, } } - GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_INITIALIZE_KERNEL); diff --git a/omp/solver/fcg_kernels.cpp b/omp/solver/fcg_kernels.cpp index e9076d113c4..b8b69ee6d91 100644 --- a/omp/solver/fcg_kernels.cpp +++ b/omp/solver/fcg_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -77,7 +77,6 @@ void initialize(std::shared_ptr exec, } } - GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_INITIALIZE_KERNEL); diff --git a/omp/solver/gmres_kernels.cpp b/omp/solver/gmres_kernels.cpp index 066ffaf2178..6c44263bc75 100644 --- a/omp/solver/gmres_kernels.cpp +++ b/omp/solver/gmres_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -58,14 +58,14 @@ namespace { template -void finish_arnoldi(matrix::Dense *next_krylov_basis, - matrix::Dense *krylov_bases, +void finish_arnoldi(size_type num_rows, matrix::Dense *krylov_bases, matrix::Dense *hessenberg_iter, size_type iter, const stopping_status *stop_status) { -#pragma omp declare reduction(add : ValueType : omp_out = omp_out + omp_in) - - for (size_type i = 0; i < next_krylov_basis->get_size()[1]; ++i) { + const auto krylov_bases_rowoffset = num_rows; + const auto next_krylov_rowoffset = (iter + 1) * krylov_bases_rowoffset; +#pragma omp declare reduction(add:ValueType : omp_out = omp_out + omp_in) + for (size_type i = 0; i < hessenberg_iter->get_size()[1]; ++i) { if (stop_status[i].has_stopped()) { continue; } @@ -73,20 +73,17 @@ void finish_arnoldi(matrix::Dense *next_krylov_basis, ValueType hessenberg_iter_entry = zero(); #pragma omp parallel for reduction(add : hessenberg_iter_entry) - for (size_type j = 0; j < next_krylov_basis->get_size()[0]; ++j) { + for (size_type j = 0; j < num_rows; ++j) { hessenberg_iter_entry += - next_krylov_basis->at(j, i) * - krylov_bases->at(j, - next_krylov_basis->get_size()[1] * k + i); + krylov_bases->at(j + next_krylov_rowoffset, i) * + krylov_bases->at(j + k * krylov_bases_rowoffset, i); } hessenberg_iter->at(k, i) = hessenberg_iter_entry; - #pragma omp parallel for - for (size_type j = 0; j < next_krylov_basis->get_size()[0]; ++j) { - next_krylov_basis->at(j, i) -= + for (size_type j = 0; j < num_rows; ++j) { + krylov_bases->at(j + next_krylov_rowoffset, i) -= hessenberg_iter->at(k, i) * - krylov_bases->at(j, - next_krylov_basis->get_size()[1] * k + i); + krylov_bases->at(j + k * krylov_bases_rowoffset, i); } } // for i in 1:iter @@ -97,20 +94,19 @@ void finish_arnoldi(matrix::Dense *next_krylov_basis, ValueType hessenberg_iter_entry = zero(); #pragma omp parallel for reduction(add : hessenberg_iter_entry) - for (size_type j = 0; j < 
next_krylov_basis->get_size()[0]; ++j) { + for (size_type j = 0; j < num_rows; ++j) { hessenberg_iter_entry += - next_krylov_basis->at(j, i) * next_krylov_basis->at(j, i); + krylov_bases->at(j + next_krylov_rowoffset, i) * + krylov_bases->at(j + next_krylov_rowoffset, i); } hessenberg_iter->at(iter + 1, i) = sqrt(hessenberg_iter_entry); - // hessenberg(iter, iter + 1) = norm(next_krylov_basis) +// hessenberg(iter + 1, iter) = norm(krylov_bases) #pragma omp parallel for - for (size_type j = 0; j < next_krylov_basis->get_size()[0]; ++j) { - next_krylov_basis->at(j, i) /= hessenberg_iter->at(iter + 1, i); - krylov_bases->at(j, next_krylov_basis->get_size()[1] * (iter + 1) + - i) = next_krylov_basis->at(j, i); + for (size_type j = 0; j < num_rows; ++j) { + krylov_bases->at(j + next_krylov_rowoffset, i) /= + hessenberg_iter->at(iter + 1, i); } // next_krylov_basis /= hessenberg(iter, iter + 1) - // krylov_bases(:, iter + 1) = next_krylov_basis // End of arnoldi } } @@ -126,28 +122,26 @@ void calculate_sin_and_cos(matrix::Dense *givens_sin, givens_cos->at(iter, rhs) = zero(); givens_sin->at(iter, rhs) = one(); } else { - auto hypotenuse = sqrt(hessenberg_iter->at(iter, rhs) * - hessenberg_iter->at(iter, rhs) + - hessenberg_iter->at(iter + 1, rhs) * - hessenberg_iter->at(iter + 1, rhs)); - givens_cos->at(iter, rhs) = - abs(hessenberg_iter->at(iter, rhs)) / hypotenuse; - givens_sin->at(iter, rhs) = givens_cos->at(iter, rhs) * - hessenberg_iter->at(iter + 1, rhs) / - hessenberg_iter->at(iter, rhs); + auto this_hess = hessenberg_iter->at(iter, rhs); + auto next_hess = hessenberg_iter->at(iter + 1, rhs); + const auto scale = abs(this_hess) + abs(next_hess); + const auto hypotenuse = + scale * sqrt(abs(this_hess / scale) * abs(this_hess / scale) + + abs(next_hess / scale) * abs(next_hess / scale)); + givens_cos->at(iter, rhs) = conj(this_hess) / hypotenuse; + givens_sin->at(iter, rhs) = conj(next_hess) / hypotenuse; } } template -void givens_rotation(matrix::Dense 
*next_krylov_basis, - matrix::Dense *givens_sin, +void givens_rotation(matrix::Dense *givens_sin, matrix::Dense *givens_cos, matrix::Dense *hessenberg_iter, size_type iter, const stopping_status *stop_status) { #pragma omp parallel for - for (size_type i = 0; i < next_krylov_basis->get_size()[1]; ++i) { + for (size_type i = 0; i < hessenberg_iter->get_size()[1]; ++i) { if (stop_status[i].has_stopped()) { continue; } @@ -155,13 +149,13 @@ void givens_rotation(matrix::Dense *next_krylov_basis, auto temp = givens_cos->at(j, i) * hessenberg_iter->at(j, i) + givens_sin->at(j, i) * hessenberg_iter->at(j + 1, i); hessenberg_iter->at(j + 1, i) = - -givens_sin->at(j, i) * hessenberg_iter->at(j, i) + - givens_cos->at(j, i) * hessenberg_iter->at(j + 1, i); + -conj(givens_sin->at(j, i)) * hessenberg_iter->at(j, i) + + conj(givens_cos->at(j, i)) * hessenberg_iter->at(j + 1, i); hessenberg_iter->at(j, i) = temp; // temp = cos(j)*hessenberg(j) + // sin(j)*hessenberg(j+1) - // hessenberg(j+1) = -sin(j)*hessenberg(j) + - // cos(j)*hessenberg(j+1) + // hessenberg(j+1) = -conj(sin(j))*hessenberg(j) + + // conj(cos(j))*hessenberg(j+1) // hessenberg(j) = temp; } @@ -172,7 +166,7 @@ void givens_rotation(matrix::Dense *next_krylov_basis, givens_sin->at(iter, i) * hessenberg_iter->at(iter + 1, i); hessenberg_iter->at(iter + 1, i) = zero(); // hessenberg(iter) = cos(iter)*hessenberg(iter) + - // sin(iter)*hessenberg(iter) + // sin(iter)*hessenberg(iter + 1) // hessenberg(iter+1) = 0 } } @@ -181,9 +175,8 @@ void givens_rotation(matrix::Dense *next_krylov_basis, template void calculate_next_residual_norm( matrix::Dense *givens_sin, matrix::Dense *givens_cos, - matrix::Dense *residual_norm, - matrix::Dense *residual_norm_collection, - const matrix::Dense *b_norm, size_type iter, + matrix::Dense> *residual_norm, + matrix::Dense *residual_norm_collection, size_type iter, const stopping_status *stop_status) { #pragma omp parallel for @@ -192,11 +185,12 @@ void calculate_next_residual_norm( 
continue; } residual_norm_collection->at(iter + 1, i) = - -givens_sin->at(iter, i) * residual_norm_collection->at(iter, i); + -conj(givens_sin)->at(iter, i) * + residual_norm_collection->at(iter, i); residual_norm_collection->at(iter, i) = givens_cos->at(iter, i) * residual_norm_collection->at(iter, i); residual_norm->at(0, i) = - abs(residual_norm_collection->at(iter + 1, i)) / b_norm->at(0, i); + abs(residual_norm_collection->at(iter + 1, i)); } } @@ -231,14 +225,14 @@ void calculate_qy(const matrix::Dense *krylov_bases, matrix::Dense *before_preconditioner, const size_type *final_iter_nums) { + const auto krylov_bases_rowoffset = before_preconditioner->get_size()[0]; #pragma omp parallel for for (size_type i = 0; i < before_preconditioner->get_size()[0]; ++i) { for (size_type k = 0; k < before_preconditioner->get_size()[1]; ++k) { before_preconditioner->at(i, k) = zero(); for (size_type j = 0; j < final_iter_nums[k]; ++j) { before_preconditioner->at(i, k) += - krylov_bases->at( - i, j * before_preconditioner->get_size()[1] + k) * + krylov_bases->at(i + j * krylov_bases_rowoffset, k) * y->at(j, k); } } @@ -252,24 +246,13 @@ void calculate_qy(const matrix::Dense *krylov_bases, template void initialize_1(std::shared_ptr exec, const matrix::Dense *b, - matrix::Dense *b_norm, matrix::Dense *residual, matrix::Dense *givens_sin, matrix::Dense *givens_cos, Array *stop_status, size_type krylov_dim) { + using norm_type = remove_complex; for (size_type j = 0; j < b->get_size()[1]; ++j) { - // Calculate b norm - ValueType norm = zero(); - -#pragma omp declare reduction(add : ValueType : omp_out = omp_out + omp_in) - -#pragma omp parallel for reduction(add : norm) - for (size_type i = 0; i < b->get_size()[0]; ++i) { - norm += b->at(i, j) * b->at(i, j); - } - b_norm->at(0, j) = sqrt(norm); - #pragma omp parallel for for (size_type i = 0; i < b->get_size()[0]; ++i) { residual->at(i, j) = b->at(i, j); @@ -290,32 +273,23 @@ 
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL); template void initialize_2(std::shared_ptr exec, const matrix::Dense *residual, - matrix::Dense *residual_norm, + matrix::Dense> *residual_norm, matrix::Dense *residual_norm_collection, matrix::Dense *krylov_bases, Array *final_iter_nums, size_type krylov_dim) { + using norm_type = remove_complex; for (size_type j = 0; j < residual->get_size()[1]; ++j) { // Calculate residual norm - ValueType res_norm = zero(); - -#pragma omp declare reduction(add : ValueType : omp_out = omp_out + omp_in) + norm_type res_norm = zero(); +#pragma omp declare reduction(add:norm_type : omp_out = omp_out + omp_in) #pragma omp parallel for reduction(add : res_norm) for (size_type i = 0; i < residual->get_size()[0]; ++i) { - res_norm += residual->at(i, j) * residual->at(i, j); + res_norm += squared_norm(residual->at(i, j)); } residual_norm->at(0, j) = sqrt(res_norm); - -#pragma omp parallel for - for (size_type i = 0; i < krylov_dim + 1; ++i) { - if (i == 0) { - residual_norm_collection->at(i, j) = residual_norm->at(0, j); - } else { - residual_norm_collection->at(i, j) = zero(); - } - } - + residual_norm_collection->at(0, j) = residual_norm->at(0, j); #pragma omp parallel for for (size_type i = 0; i < residual->get_size()[0]; ++i) { krylov_bases->at(i, j) = @@ -323,29 +297,19 @@ void initialize_2(std::shared_ptr exec, } final_iter_nums->get_data()[j] = 0; } - -#pragma omp parallel for - for (size_type i = 0; i < krylov_bases->get_size()[0]; ++i) { - for (size_type j = residual->get_size()[1]; - j < krylov_bases->get_size()[1]; ++j) { - krylov_bases->at(i, j) = zero(); - } - } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL); template -void step_1(std::shared_ptr exec, - matrix::Dense *next_krylov_basis, +void step_1(std::shared_ptr exec, size_type num_rows, matrix::Dense *givens_sin, matrix::Dense *givens_cos, - matrix::Dense *residual_norm, + matrix::Dense> *residual_norm, 
matrix::Dense *residual_norm_collection, matrix::Dense *krylov_bases, - matrix::Dense *hessenberg_iter, - const matrix::Dense *b_norm, size_type iter, + matrix::Dense *hessenberg_iter, size_type iter, Array *final_iter_nums, const Array *stop_status) { @@ -355,12 +319,12 @@ void step_1(std::shared_ptr exec, (1 - stop_status->get_const_data()[i].has_stopped()); } - finish_arnoldi(next_krylov_basis, krylov_bases, hessenberg_iter, iter, + finish_arnoldi(num_rows, krylov_bases, hessenberg_iter, iter, stop_status->get_const_data()); - givens_rotation(next_krylov_basis, givens_sin, givens_cos, hessenberg_iter, - iter, stop_status->get_const_data()); + givens_rotation(givens_sin, givens_cos, hessenberg_iter, iter, + stop_status->get_const_data()); calculate_next_residual_norm(givens_sin, givens_cos, residual_norm, - residual_norm_collection, b_norm, iter, + residual_norm_collection, iter, stop_status->get_const_data()); } diff --git a/omp/solver/ir_kernels.cpp b/omp/solver/ir_kernels.cpp index df7f6ff87e0..ba68c407e95 100644 --- a/omp/solver/ir_kernels.cpp +++ b/omp/solver/ir_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/omp/solver/lower_trs_kernels.cpp b/omp/solver/lower_trs_kernels.cpp index bdfd73e94b1..af6a0670ea0 100644 --- a/omp/solver/lower_trs_kernels.cpp +++ b/omp/solver/lower_trs_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/omp/solver/upper_trs_kernels.cpp b/omp/solver/upper_trs_kernels.cpp index ed1fdea3799..2fa0b6a3db9 100644 --- a/omp/solver/upper_trs_kernels.cpp +++ b/omp/solver/upper_trs_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/omp/stop/criterion_kernels.cpp b/omp/stop/criterion_kernels.cpp index b0f9517b980..ef8ff9f1221 100644 --- a/omp/stop/criterion_kernels.cpp +++ b/omp/stop/criterion_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/omp/stop/residual_norm_reduction_kernels.cpp b/omp/stop/residual_norm_kernels.cpp similarity index 62% rename from omp/stop/residual_norm_reduction_kernels.cpp rename to omp/stop/residual_norm_kernels.cpp index 06e2485f6c0..1fd3a14cf85 100644 --- a/omp/stop/residual_norm_reduction_kernels.cpp +++ b/omp/stop/residual_norm_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,59 +30,64 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ -#include "core/stop/residual_norm_reduction_kernels.hpp" +#include "core/stop/residual_norm_kernels.hpp" #include #include +#include namespace gko { namespace kernels { namespace omp { /** - * @brief The Residual norm reduction stopping criterion namespace. + * @brief The Residual norm stopping criterion namespace. * @ref resnorm * @ingroup resnorm */ -namespace residual_norm_reduction { +namespace residual_norm { template -void residual_norm_reduction(std::shared_ptr exec, - const matrix::Dense *tau, - const matrix::Dense *orig_tau, - remove_complex rel_residual_goal, - uint8 stoppingId, bool setFinalized, - Array *stop_status, - Array *device_storage, bool *all_converged, - bool *one_changed) +void residual_norm(std::shared_ptr exec, + const matrix::Dense *tau, + const matrix::Dense *orig_tau, + ValueType rel_residual_goal, uint8 stoppingId, + bool setFinalized, Array *stop_status, + Array *device_storage, bool *all_converged, + bool *one_changed) { - *all_converged = true; - *one_changed = false; -#pragma omp parallel for + static_assert(is_complex_s::value == false, + "ValueType must not be complex in this function!"); + bool local_one_changed = false; +#pragma omp parallel for reduction(|| : local_one_changed) for (size_type i = 0; i < tau->get_size()[1]; ++i) { - if (abs(tau->at(i)) < rel_residual_goal * abs(orig_tau->at(i))) { + if (tau->at(i) < rel_residual_goal * orig_tau->at(i)) { stop_status->get_data()[i].converge(stoppingId, setFinalized); - *one_changed = true; + local_one_changed = true; } } - // No early stopping here because one cannot use break with omp parallel - // for But it's parallel so does it matter? -#pragma omp parallel for + *one_changed = local_one_changed; + // No early stopping here because one cannot use break with parallel for + // But it's parallel so does it matter? 
+ bool local_all_converged = true; +#pragma omp parallel for reduction(&& : local_all_converged) for (size_type i = 0; i < stop_status->get_num_elems(); ++i) { if (!stop_status->get_const_data()[i].has_stopped()) { - *all_converged = false; + local_all_converged = false; } } + *all_converged = local_all_converged; } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE( + GKO_DECLARE_RESIDUAL_NORM_KERNEL); -} // namespace residual_norm_reduction +} // namespace residual_norm } // namespace omp } // namespace kernels } // namespace gko diff --git a/omp/test/CMakeLists.txt b/omp/test/CMakeLists.txt index cd7e0fdba99..d746413f53f 100644 --- a/omp/test/CMakeLists.txt +++ b/omp/test/CMakeLists.txt @@ -1,3 +1,6 @@ +include(${CMAKE_SOURCE_DIR}/cmake/create_test.cmake) + +add_subdirectory(components) add_subdirectory(factorization) add_subdirectory(matrix) add_subdirectory(preconditioner) diff --git a/omp/test/components/CMakeLists.txt b/omp/test/components/CMakeLists.txt new file mode 100644 index 00000000000..9c1dca5bcfa --- /dev/null +++ b/omp/test/components/CMakeLists.txt @@ -0,0 +1,3 @@ +ginkgo_create_test(fill_array) +ginkgo_create_test(precision_conversion) +ginkgo_create_test(prefix_sum) diff --git a/omp/test/components/fill_array.cpp b/omp/test/components/fill_array.cpp new file mode 100644 index 00000000000..ad657e7e6e2 --- /dev/null +++ b/omp/test/components/fill_array.cpp @@ -0,0 +1,86 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/components/fill_array.hpp" + + +#include +#include +#include + + +#include + + +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +template +class FillArray : public ::testing::Test { +protected: + using value_type = T; + FillArray() + : ref(gko::ReferenceExecutor::create()), + exec(gko::OmpExecutor::create()), + total_size(63531), + vals(ref, total_size), + dvals(exec, total_size) + { + std::fill_n(vals.get_data(), total_size, T(1523)); + } + + std::shared_ptr ref; + std::shared_ptr exec; + gko::size_type total_size; + gko::Array vals; + gko::Array dvals; +}; + +TYPED_TEST_CASE(FillArray, gko::test::ValueAndIndexTypes); + + +TYPED_TEST(FillArray, EqualsReference) +{ + using T = typename TestFixture::value_type; + gko::kernels::omp::components::fill_array( + this->exec, this->dvals.get_data(), this->total_size, T(1523)); + GKO_ASSERT_ARRAY_EQ(this->vals, this->dvals); +} + + +} // namespace diff --git a/omp/test/components/precision_conversion.cpp b/omp/test/components/precision_conversion.cpp new file mode 100644 index 00000000000..ffd9c25df3e --- /dev/null +++ b/omp/test/components/precision_conversion.cpp @@ -0,0 +1,173 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include +#include +#include +#include +#include + + +#include + + +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +class PrecisionConversion : public ::testing::Test { +protected: + PrecisionConversion() + : ref(gko::ReferenceExecutor::create()), + exec(gko::OmpExecutor::create()), + rand(293), + total_size(42793), + vals(ref, total_size), + cvals(ref, total_size), + vals2(ref, 1), + expected_float(ref, 1), + expected_double(ref, 1), + dvals(exec), + dcvals(exec), + dvals2(exec) + { + auto maxval = 1e10f; + std::uniform_real_distribution dist(-maxval, maxval); + for (gko::size_type i = 0; i < total_size; ++i) { + vals.get_data()[i] = dist(rand); + cvals.get_data()[i] = {dist(rand), dist(rand)}; + } + dvals = vals; + dcvals = cvals; + gko::uint64 rawdouble{0x4218888000889111ULL}; + gko::uint32 rawfloat{0x50c44400UL}; + gko::uint64 rawrounded{0x4218888000000000ULL}; + std::memcpy(vals2.get_data(), &rawdouble, sizeof(double)); + 
std::memcpy(expected_float.get_data(), &rawfloat, sizeof(float)); + std::memcpy(expected_double.get_data(), &rawrounded, sizeof(double)); + dvals2 = vals2; + } + + std::shared_ptr ref; + std::shared_ptr exec; + std::default_random_engine rand; + gko::size_type total_size; + gko::Array vals; + gko::Array dvals; + gko::Array vals2; + gko::Array dvals2; + gko::Array expected_float; + gko::Array expected_double; + gko::Array> cvals; + gko::Array> dcvals; +}; + + +TEST_F(PrecisionConversion, ConvertsReal) +{ + gko::Array dtmp; + gko::Array dout; + + dtmp = dvals; + dout = dtmp; + + GKO_ASSERT_ARRAY_EQ(dvals, dout); +} + + +TEST_F(PrecisionConversion, ConvertsRealViaRef) +{ + gko::Array tmp{ref}; + gko::Array dout; + + tmp = dvals; + dout = tmp; + + GKO_ASSERT_ARRAY_EQ(dvals, dout); +} + + +TEST_F(PrecisionConversion, ConvertsComplex) +{ + gko::Array> dtmp; + gko::Array> dout; + + dtmp = dcvals; + dout = dtmp; + + GKO_ASSERT_ARRAY_EQ(dcvals, dout); +} + + +TEST_F(PrecisionConversion, ConversionRounds) +{ + gko::Array dtmp; + gko::Array dout; + + dtmp = dvals2; + dout = dtmp; + + GKO_ASSERT_ARRAY_EQ(dtmp, expected_float); + GKO_ASSERT_ARRAY_EQ(dout, expected_double); +} + + +TEST_F(PrecisionConversion, ConvertsRealFromRef) +{ + gko::Array dtmp; + gko::Array dout; + + dtmp = vals; + dout = dtmp; + + GKO_ASSERT_ARRAY_EQ(dvals, dout); +} + + +TEST_F(PrecisionConversion, ConvertsComplexFromRef) +{ + gko::Array> dtmp; + gko::Array> dout; + + dtmp = cvals; + dout = dtmp; + + GKO_ASSERT_ARRAY_EQ(dcvals, dout); +} + + +} // namespace diff --git a/omp/test/components/prefix_sum.cpp b/omp/test/components/prefix_sum.cpp new file mode 100644 index 00000000000..277667b7801 --- /dev/null +++ b/omp/test/components/prefix_sum.cpp @@ -0,0 +1,98 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/components/prefix_sum.hpp" + + +#include +#include +#include + + +#include + + +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +template +class PrefixSum : public ::testing::Test { +protected: + using index_type = T; + PrefixSum() + : ref(gko::ReferenceExecutor::create()), + exec(gko::OmpExecutor::create()), + rand(293), + total_size(42793), + vals(ref, total_size), + dvals(exec) + { + std::uniform_int_distribution dist(0, 1000); + for (gko::size_type i = 0; i < total_size; ++i) { + vals.get_data()[i] = dist(rand); + } + dvals = vals; + } + + void test(gko::size_type size) + { + gko::kernels::reference::components::prefix_sum(ref, vals.get_data(), + size); + gko::kernels::omp::components::prefix_sum(exec, dvals.get_data(), size); + + GKO_ASSERT_ARRAY_EQ(vals, dvals); + } + + std::shared_ptr ref; + std::shared_ptr exec; + std::default_random_engine rand; + gko::size_type total_size; + gko::Array vals; + gko::Array dvals; +}; + +TYPED_TEST_CASE(PrefixSum, gko::test::IndexTypes); + + +TYPED_TEST(PrefixSum, SmallEqualsReference) { this->test(100); } + + +TYPED_TEST(PrefixSum, BigEqualsReference) { this->test(this->total_size); } + + +} // namespace diff --git a/omp/test/factorization/CMakeLists.txt b/omp/test/factorization/CMakeLists.txt index 36c21b93eea..b52c2d938d7 100644 --- a/omp/test/factorization/CMakeLists.txt +++ b/omp/test/factorization/CMakeLists.txt @@ -1 +1,3 @@ +ginkgo_create_test(par_ict_kernels) ginkgo_create_test(par_ilu_kernels) +ginkgo_create_test(par_ilut_kernels) diff --git a/omp/test/factorization/par_ict_kernels.cpp b/omp/test/factorization/par_ict_kernels.cpp new file mode 100644 index 00000000000..95ab07b4030 --- /dev/null +++ b/omp/test/factorization/par_ict_kernels.cpp @@ -0,0 +1,193 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/factorization/par_ict_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/factorization/factorization_kernels.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/test/utils.hpp" +#include "matrices/config.hpp" + + +namespace { + + +template +class ParIct : public ::testing::Test { +protected: + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Coo = gko::matrix::Coo; + using Csr = gko::matrix::Csr; + + ParIct() + : mtx_size(532, 532), + rand_engine(567321), + ref(gko::ReferenceExecutor::create()), + omp(gko::OmpExecutor::create()) + { + mtx = gko::test::generate_random_matrix( + mtx_size[0], mtx_size[1], + std::uniform_int_distribution(10, mtx_size[1]), + std::normal_distribution>(-1.0, + 1.0), + rand_engine, ref); + mtx_l = gko::test::generate_random_lower_triangular_matrix( + mtx_size[0], mtx_size[0], false, + std::uniform_int_distribution(10, mtx_size[0]), + std::normal_distribution>(-1.0, + 1.0), + rand_engine, ref); + + dmtx_ani = Csr::create(omp); + dmtx_l_ani = Csr::create(omp); + dmtx = Csr::create(omp); + dmtx->copy_from(lend(mtx)); + dmtx_l = Csr::create(omp); + dmtx_l->copy_from(lend(mtx_l)); + } + + void SetUp() + { + std::string file_name(gko::matrices::location_ani4_mtx); + auto input_file = std::ifstream(file_name, std::ios::in); + if (!input_file) { + FAIL() << "Could not find the file \"" << file_name + << "\", which is required for this test.\n"; + } + mtx_ani = gko::read(input_file, ref); + mtx_ani->sort_by_column_index(); + + { + mtx_l_ani = Csr::create(ref, mtx_ani->get_size()); + gko::matrix::CsrBuilder l_builder( + lend(mtx_l_ani)); + 
gko::kernels::reference::factorization::initialize_row_ptrs_l( + ref, lend(mtx_ani), mtx_l_ani->get_row_ptrs()); + auto l_nnz = + mtx_l_ani->get_const_row_ptrs()[mtx_ani->get_size()[0]]; + l_builder.get_col_idx_array().resize_and_reset(l_nnz); + l_builder.get_value_array().resize_and_reset(l_nnz); + gko::kernels::reference::factorization::initialize_l( + ref, lend(mtx_ani), lend(mtx_l_ani), true); + } + dmtx_ani->copy_from(lend(mtx_ani)); + dmtx_l_ani->copy_from(lend(mtx_l_ani)); + } + + std::shared_ptr ref; + std::shared_ptr omp; + + const gko::dim<2> mtx_size; + std::default_random_engine rand_engine; + + std::unique_ptr mtx_ani; + std::unique_ptr mtx_l_ani; + std::unique_ptr mtx; + std::unique_ptr mtx_l; + + std::unique_ptr dmtx_ani; + std::unique_ptr dmtx_l_ani; + std::unique_ptr dmtx; + std::unique_ptr dmtx_l; +}; + +TYPED_TEST_CASE(ParIct, gko::test::ValueIndexTypes); + + +TYPED_TEST(ParIct, KernelAddCandidatesIsEquivalentToRef) +{ + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + auto mtx_llt = Csr::create(this->ref, this->mtx_size); + this->mtx_l->apply(lend(this->mtx_l->transpose()), lend(mtx_llt)); + auto dmtx_llt = Csr::create(this->omp, this->mtx_size); + dmtx_llt->copy_from(lend(mtx_llt)); + auto res_mtx_l = Csr::create(this->ref, this->mtx_size); + auto dres_mtx_l = Csr::create(this->omp, this->mtx_size); + + gko::kernels::reference::par_ict_factorization::add_candidates( + this->ref, lend(mtx_llt), lend(this->mtx), lend(this->mtx_l), + lend(res_mtx_l)); + gko::kernels::omp::par_ict_factorization::add_candidates( + this->omp, lend(dmtx_llt), lend(this->dmtx), lend(this->dmtx_l), + lend(dres_mtx_l)); + + GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_l, dres_mtx_l); + GKO_ASSERT_MTX_NEAR(res_mtx_l, dres_mtx_l, r::value); +} + + +TYPED_TEST(ParIct, KernelComputeFactorIsEquivalentToRef) +{ + using Csr = typename TestFixture::Csr; + using Coo = typename TestFixture::Coo; + auto square_size = this->mtx_ani->get_size(); + 
auto mtx_l_coo = Coo::create(this->ref, square_size); + this->mtx_l_ani->convert_to(lend(mtx_l_coo)); + auto dmtx_l_coo = Coo::create(this->omp, square_size); + dmtx_l_coo->copy_from(lend(mtx_l_coo)); + + gko::kernels::reference::par_ict_factorization::compute_factor( + this->ref, lend(this->mtx_ani), lend(this->mtx_l_ani), lend(mtx_l_coo)); + for (int i = 0; i < 20; ++i) { + gko::kernels::omp::par_ict_factorization::compute_factor( + this->omp, lend(this->dmtx_ani), lend(this->dmtx_l_ani), + lend(dmtx_l_coo)); + } + + GKO_ASSERT_MTX_NEAR(this->mtx_l_ani, this->dmtx_l_ani, 1e-2); +} + + +} // namespace diff --git a/omp/test/factorization/par_ilu_kernels.cpp b/omp/test/factorization/par_ilu_kernels.cpp index 46f8a3e22fb..41ff692b702 100644 --- a/omp/test/factorization/par_ilu_kernels.cpp +++ b/omp/test/factorization/par_ilu_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include @@ -44,11 +45,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include +#include "core/factorization/factorization_kernels.hpp" #include "core/test/utils.hpp" #include "matrices/config.hpp" @@ -56,16 +59,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
namespace { +template class ParIlu : public ::testing::Test { protected: - using value_type = gko::default_precision; - using index_type = gko::int32; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; using Dense = gko::matrix::Dense; using Coo = gko::matrix::Coo; using Csr = gko::matrix::Csr; + std::ranlux48 rand_engine; + std::shared_ptr ref; + std::shared_ptr omp; + std::shared_ptr csr_ref; + std::shared_ptr csr_omp; + ParIlu() - : ref(gko::ReferenceExecutor::create()), + : rand_engine(17), + ref(gko::ReferenceExecutor::create()), omp(gko::OmpExecutor::create()), csr_ref(nullptr), csr_omp(nullptr) @@ -79,25 +92,62 @@ class ParIlu : public ::testing::Test { FAIL() << "Could not find the file \"" << file_name << "\", which is required for this test.\n"; } - csr_ref = gko::read(input_file, ref); + auto csr_ref_temp = gko::read(input_file, ref); auto csr_omp_temp = Csr::create(omp); - csr_omp_temp->copy_from(gko::lend(csr_ref)); + csr_omp_temp->copy_from(gko::lend(csr_ref_temp)); + // Make sure there are diagonal elements present + gko::kernels::reference::factorization::add_diagonal_elements( + ref, gko::lend(csr_ref_temp), false); + gko::kernels::omp::factorization::add_diagonal_elements( + omp, gko::lend(csr_omp_temp), false); + csr_ref = gko::give(csr_ref_temp); csr_omp = gko::give(csr_omp_temp); } - std::shared_ptr ref; - std::shared_ptr omp; - std::shared_ptr csr_ref; - std::shared_ptr csr_omp; + template + std::unique_ptr gen_mtx(index_type num_rows, index_type num_cols) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution(0, num_cols - 1), + std::normal_distribution>(0.0, 1.0), + rand_engine, ref); + } + + std::unique_ptr gen_unsorted_mtx(index_type num_rows, + index_type num_cols) + { + using std::swap; + auto mtx = gen_mtx(num_rows, num_cols); + auto values = mtx->get_values(); + 
auto col_idxs = mtx->get_col_idxs(); + const auto row_ptrs = mtx->get_const_row_ptrs(); + for (int row = 0; row < num_rows; ++row) { + const auto row_start = row_ptrs[row]; + const auto row_end = row_ptrs[row + 1]; + const int num_row_elements = row_end - row_start; + auto idx_dist = std::uniform_int_distribution( + row_start, row_end - 1); + for (int i = 0; i < num_row_elements / 2; ++i) { + auto idx1 = idx_dist(rand_engine); + auto idx2 = idx_dist(rand_engine); + if (idx1 != idx2) { + swap(values[idx1], values[idx2]); + swap(col_idxs[idx1], col_idxs[idx2]); + } + } + } + return mtx; + } void initialize_row_ptrs(index_type *l_row_ptrs_ref, index_type *u_row_ptrs_ref, index_type *l_row_ptrs_omp, index_type *u_row_ptrs_omp) { - gko::kernels::reference::par_ilu_factorization::initialize_row_ptrs_l_u( + gko::kernels::reference::factorization::initialize_row_ptrs_l_u( ref, gko::lend(csr_ref), l_row_ptrs_ref, u_row_ptrs_ref); - gko::kernels::omp::par_ilu_factorization::initialize_row_ptrs_l_u( + gko::kernels::omp::factorization::initialize_row_ptrs_l_u( omp, gko::lend(csr_omp), l_row_ptrs_omp, u_row_ptrs_omp); } @@ -123,18 +173,18 @@ class ParIlu : public ::testing::Test { *l_omp = Csr::create(omp, csr_omp->get_size(), l_nnz); *u_omp = Csr::create(omp, csr_omp->get_size(), u_nnz); // Copy the already initialized `row_ptrs` to the new matrices - ref->copy_from(gko::lend(ref), num_row_ptrs, l_row_ptrs_ref.get_data(), - (*l_ref)->get_row_ptrs()); - ref->copy_from(gko::lend(ref), num_row_ptrs, u_row_ptrs_ref.get_data(), - (*u_ref)->get_row_ptrs()); - omp->copy_from(gko::lend(omp), num_row_ptrs, l_row_ptrs_omp.get_data(), - (*l_omp)->get_row_ptrs()); - omp->copy_from(gko::lend(omp), num_row_ptrs, u_row_ptrs_omp.get_data(), - (*u_omp)->get_row_ptrs()); - - gko::kernels::reference::par_ilu_factorization::initialize_l_u( + ref->copy(num_row_ptrs, l_row_ptrs_ref.get_data(), + (*l_ref)->get_row_ptrs()); + ref->copy(num_row_ptrs, u_row_ptrs_ref.get_data(), + 
(*u_ref)->get_row_ptrs()); + omp->copy(num_row_ptrs, l_row_ptrs_omp.get_data(), + (*l_omp)->get_row_ptrs()); + omp->copy(num_row_ptrs, u_row_ptrs_omp.get_data(), + (*u_omp)->get_row_ptrs()); + + gko::kernels::reference::factorization::initialize_l_u( ref, gko::lend(csr_ref), gko::lend(*l_ref), gko::lend(*u_ref)); - gko::kernels::omp::par_ilu_factorization::initialize_l_u( + gko::kernels::omp::factorization::initialize_l_u( omp, gko::lend(csr_omp), gko::lend(*l_omp), gko::lend(*u_omp)); } @@ -174,21 +224,87 @@ class ParIlu : public ::testing::Test { } }; +TYPED_TEST_CASE(ParIlu, gko::test::ValueIndexTypes); + + +TYPED_TEST(ParIlu, OmpKernelAddDiagonalElementsSortedEquivalentToRef) +{ + using index_type = typename TestFixture::index_type; + using Csr = typename TestFixture::Csr; + index_type num_rows{200}; + index_type num_cols{200}; + auto mtx_ref = this->template gen_mtx(num_rows, num_cols); + auto mtx_omp = Csr::create(this->omp); + mtx_omp->copy_from(gko::lend(mtx_ref)); + + gko::kernels::reference::factorization::add_diagonal_elements( + this->ref, gko::lend(mtx_ref), true); + gko::kernels::omp::factorization::add_diagonal_elements( + this->omp, gko::lend(mtx_omp), true); + + ASSERT_TRUE(mtx_ref->is_sorted_by_column_index()); + GKO_ASSERT_MTX_NEAR(mtx_ref, mtx_omp, 0.); + GKO_ASSERT_MTX_EQ_SPARSITY(mtx_ref, mtx_omp); +} + + +TYPED_TEST(ParIlu, OmpKernelAddDiagonalElementsUnsortedEquivalentToRef) +{ + using index_type = typename TestFixture::index_type; + using Csr = typename TestFixture::Csr; + index_type num_rows{200}; + index_type num_cols{200}; + auto mtx_ref = this->gen_unsorted_mtx(num_rows, num_cols); + auto mtx_omp = Csr::create(this->omp); + mtx_omp->copy_from(gko::lend(mtx_ref)); + + gko::kernels::reference::factorization::add_diagonal_elements( + this->ref, gko::lend(mtx_ref), false); + gko::kernels::omp::factorization::add_diagonal_elements( + this->omp, gko::lend(mtx_omp), false); + + ASSERT_FALSE(mtx_ref->is_sorted_by_column_index()); + 
GKO_ASSERT_MTX_NEAR(mtx_ref, mtx_omp, 0.); + GKO_ASSERT_MTX_EQ_SPARSITY(mtx_ref, mtx_omp); +} + + +TYPED_TEST(ParIlu, OmpKernelAddDiagonalElementsNonSquareEquivalentToRef) +{ + using index_type = typename TestFixture::index_type; + using Csr = typename TestFixture::Csr; + index_type num_rows{200}; + index_type num_cols{100}; + auto mtx_ref = this->template gen_mtx(num_rows, num_cols); + auto mtx_omp = Csr::create(this->omp); + mtx_omp->copy_from(gko::lend(mtx_ref)); + + gko::kernels::reference::factorization::add_diagonal_elements( + this->ref, gko::lend(mtx_ref), true); + gko::kernels::omp::factorization::add_diagonal_elements( + this->omp, gko::lend(mtx_omp), true); + + ASSERT_TRUE(mtx_ref->is_sorted_by_column_index()); + GKO_ASSERT_MTX_NEAR(mtx_ref, mtx_omp, 0.); + GKO_ASSERT_MTX_EQ_SPARSITY(mtx_ref, mtx_omp); +} + -TEST_F(ParIlu, OmpKernelInitializeRowPtrsLUEquivalentToRef) +TYPED_TEST(ParIlu, OmpKernelInitializeRowPtrsLUEquivalentToRef) { - auto num_row_ptrs = csr_ref->get_size()[0] + 1; - gko::Array l_row_ptrs_array_ref(ref, num_row_ptrs); - gko::Array u_row_ptrs_array_ref(ref, num_row_ptrs); - gko::Array l_row_ptrs_array_omp(omp, num_row_ptrs); - gko::Array u_row_ptrs_array_omp(omp, num_row_ptrs); + using index_type = typename TestFixture::index_type; + auto num_row_ptrs = this->csr_ref->get_size()[0] + 1; + gko::Array l_row_ptrs_array_ref(this->ref, num_row_ptrs); + gko::Array u_row_ptrs_array_ref(this->ref, num_row_ptrs); + gko::Array l_row_ptrs_array_omp(this->omp, num_row_ptrs); + gko::Array u_row_ptrs_array_omp(this->omp, num_row_ptrs); auto l_row_ptrs_ref = l_row_ptrs_array_ref.get_data(); auto u_row_ptrs_ref = u_row_ptrs_array_ref.get_data(); auto l_row_ptrs_omp = l_row_ptrs_array_omp.get_data(); auto u_row_ptrs_omp = u_row_ptrs_array_omp.get_data(); - initialize_row_ptrs(l_row_ptrs_ref, u_row_ptrs_ref, l_row_ptrs_omp, - u_row_ptrs_omp); + this->initialize_row_ptrs(l_row_ptrs_ref, u_row_ptrs_ref, l_row_ptrs_omp, + u_row_ptrs_omp); 
ASSERT_TRUE(std::equal(l_row_ptrs_ref, l_row_ptrs_ref + num_row_ptrs, l_row_ptrs_omp)); @@ -197,46 +313,57 @@ TEST_F(ParIlu, OmpKernelInitializeRowPtrsLUEquivalentToRef) } -TEST_F(ParIlu, KernelInitializeParILUIsEquivalentToRef) +TYPED_TEST(ParIlu, KernelInitializeParILUIsEquivalentToRef) { + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; std::unique_ptr l_ref{}; std::unique_ptr u_ref{}; std::unique_ptr l_omp{}; std::unique_ptr u_omp{}; - initialize_lu(&l_ref, &u_ref, &l_omp, &u_omp); + this->initialize_lu(&l_ref, &u_ref, &l_omp, &u_omp); - GKO_ASSERT_MTX_NEAR(l_ref, l_omp, 1e-14); - GKO_ASSERT_MTX_NEAR(u_ref, u_omp, 1e-14); + GKO_ASSERT_MTX_NEAR(l_ref, l_omp, r::value); + GKO_ASSERT_MTX_NEAR(u_ref, u_omp, r::value); + GKO_ASSERT_MTX_EQ_SPARSITY(l_ref, l_omp); + GKO_ASSERT_MTX_EQ_SPARSITY(u_ref, u_omp); } -TEST_F(ParIlu, KernelComputeParILUIsEquivalentToRef) +TYPED_TEST(ParIlu, KernelComputeParILUIsEquivalentToRef) { + using Csr = typename TestFixture::Csr; std::unique_ptr l_ref{}; std::unique_ptr u_ref{}; std::unique_ptr l_omp{}; std::unique_ptr u_omp{}; - compute_lu(&l_ref, &u_ref, &l_omp, &u_omp); + this->compute_lu(&l_ref, &u_ref, &l_omp, &u_omp); GKO_ASSERT_MTX_NEAR(l_ref, l_omp, 5e-2); GKO_ASSERT_MTX_NEAR(u_ref, u_omp, 5e-2); + GKO_ASSERT_MTX_EQ_SPARSITY(l_ref, l_omp); + GKO_ASSERT_MTX_EQ_SPARSITY(u_ref, u_omp); } -TEST_F(ParIlu, KernelComputeParILUWithMoreIterationsIsEquivalentToRef) +TYPED_TEST(ParIlu, KernelComputeParILUWithMoreIterationsIsEquivalentToRef) { + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; std::unique_ptr l_ref{}; std::unique_ptr u_ref{}; std::unique_ptr l_omp{}; std::unique_ptr u_omp{}; - gko::size_type iterations{20}; + gko::size_type iterations{30}; - compute_lu(&l_ref, &u_ref, &l_omp, &u_omp, iterations); + this->compute_lu(&l_ref, &u_ref, &l_omp, &u_omp, iterations); - GKO_ASSERT_MTX_NEAR(l_ref, l_omp, 1e-14); - GKO_ASSERT_MTX_NEAR(u_ref, 
u_omp, 1e-14); + GKO_ASSERT_MTX_NEAR(l_ref, l_omp, r::value); + GKO_ASSERT_MTX_NEAR(u_ref, u_omp, r::value); + GKO_ASSERT_MTX_EQ_SPARSITY(l_ref, l_omp); + GKO_ASSERT_MTX_EQ_SPARSITY(u_ref, u_omp); } diff --git a/omp/test/factorization/par_ilut_kernels.cpp b/omp/test/factorization/par_ilut_kernels.cpp new file mode 100644 index 00000000000..7af808a258b --- /dev/null +++ b/omp/test/factorization/par_ilut_kernels.cpp @@ -0,0 +1,467 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/factorization/factorization_kernels.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/test/utils.hpp" +#include "matrices/config.hpp" + + +namespace { + + +template +class ParIlut : public ::testing::Test { +protected: + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Dense = gko::matrix::Dense; + using Coo = gko::matrix::Coo; + using Csr = gko::matrix::Csr; + + ParIlut() + : mtx_size(532, 423), + rand_engine(1337), + ref(gko::ReferenceExecutor::create()), + omp(gko::OmpExecutor::create()) + { + mtx1 = gko::test::generate_random_matrix( + mtx_size[0], mtx_size[1], + std::uniform_int_distribution(10, mtx_size[1]), + std::normal_distribution>(-1.0, + 1.0), + rand_engine, ref); + mtx2 = gko::test::generate_random_matrix( + mtx_size[0], mtx_size[1], + std::uniform_int_distribution(0, mtx_size[1]), + std::normal_distribution>(-1.0, + 1.0), + rand_engine, ref); + mtx_square = gko::test::generate_random_matrix( + mtx_size[0], mtx_size[0], + std::uniform_int_distribution(1, mtx_size[0]), + std::normal_distribution>(-1.0, + 
1.0), + rand_engine, ref); + mtx_l = gko::test::generate_random_lower_triangular_matrix( + mtx_size[0], mtx_size[0], false, + std::uniform_int_distribution(10, mtx_size[0]), + std::normal_distribution>(-1.0, + 1.0), + rand_engine, ref); + mtx_l2 = gko::test::generate_random_lower_triangular_matrix( + mtx_size[0], mtx_size[0], true, + std::uniform_int_distribution(1, mtx_size[0]), + std::normal_distribution>(-1.0, + 1.0), + rand_engine, ref); + mtx_u = gko::test::generate_random_upper_triangular_matrix( + mtx_size[0], mtx_size[0], false, + std::uniform_int_distribution(10, mtx_size[0]), + std::normal_distribution>(-1.0, + 1.0), + rand_engine, ref); + + dmtx1 = Csr::create(omp); + dmtx1->copy_from(mtx1.get()); + dmtx2 = Csr::create(omp); + dmtx2->copy_from(mtx2.get()); + dmtx_square = Csr::create(omp); + dmtx_square->copy_from(mtx_square.get()); + dmtx_ani = Csr::create(omp); + dmtx_l_ani = Csr::create(omp); + dmtx_u_ani = Csr::create(omp); + dmtx_ut_ani = Csr::create(omp); + dmtx_l = Csr::create(omp); + dmtx_l->copy_from(mtx_l.get()); + dmtx_l2 = Csr::create(omp); + dmtx_l2->copy_from(mtx_l2.get()); + dmtx_u = Csr::create(omp); + dmtx_u->copy_from(mtx_u.get()); + } + + void SetUp() + { + std::string file_name(gko::matrices::location_ani4_mtx); + auto input_file = std::ifstream(file_name, std::ios::in); + if (!input_file) { + FAIL() << "Could not find the file \"" << file_name + << "\", which is required for this test.\n"; + } + mtx_ani = gko::read(input_file, ref); + mtx_ani->sort_by_column_index(); + + { + mtx_l_ani = Csr::create(ref, mtx_ani->get_size()); + mtx_u_ani = Csr::create(ref, mtx_ani->get_size()); + gko::matrix::CsrBuilder l_builder( + mtx_l_ani.get()); + gko::matrix::CsrBuilder u_builder( + mtx_u_ani.get()); + gko::kernels::reference::factorization::initialize_row_ptrs_l_u( + ref, mtx_ani.get(), mtx_l_ani->get_row_ptrs(), + mtx_u_ani->get_row_ptrs()); + auto l_nnz = + mtx_l_ani->get_const_row_ptrs()[mtx_ani->get_size()[0]]; + auto u_nnz = + 
mtx_u_ani->get_const_row_ptrs()[mtx_ani->get_size()[0]]; + l_builder.get_col_idx_array().resize_and_reset(l_nnz); + l_builder.get_value_array().resize_and_reset(l_nnz); + u_builder.get_col_idx_array().resize_and_reset(u_nnz); + u_builder.get_value_array().resize_and_reset(u_nnz); + gko::kernels::reference::factorization::initialize_l_u( + ref, mtx_ani.get(), mtx_l_ani.get(), mtx_u_ani.get()); + mtx_ut_ani = Csr::create(ref, mtx_ani->get_size(), + mtx_u_ani->get_num_stored_elements()); + gko::kernels::reference::csr::transpose(ref, mtx_u_ani.get(), + mtx_ut_ani.get()); + } + dmtx_ani->copy_from(mtx_ani.get()); + dmtx_l_ani->copy_from(mtx_l_ani.get()); + dmtx_u_ani->copy_from(mtx_u_ani.get()); + dmtx_ut_ani->copy_from(mtx_ut_ani.get()); + } + + void test_select(const std::unique_ptr &mtx, + const std::unique_ptr &dmtx, index_type rank, + gko::remove_complex tolerance = 0.0) + { + auto size = index_type(mtx->get_num_stored_elements()); + + gko::remove_complex res{}; + gko::remove_complex dres{}; + gko::Array tmp(ref); + gko::Array> tmp2(ref); + gko::Array dtmp(omp); + gko::Array> dtmp2(omp); + + gko::kernels::reference::par_ilut_factorization::threshold_select( + ref, mtx.get(), rank, tmp, tmp2, res); + gko::kernels::omp::par_ilut_factorization::threshold_select( + omp, dmtx.get(), rank, dtmp, dtmp2, dres); + + ASSERT_EQ(res, dres); + } + + void test_filter(const std::unique_ptr &mtx, + const std::unique_ptr &dmtx, + gko::remove_complex threshold, bool lower) + { + auto res = Csr::create(ref, mtx_size); + auto dres = Csr::create(omp, mtx_size); + auto res_coo = Coo::create(ref, mtx_size); + auto dres_coo = Coo::create(omp, mtx_size); + auto local_mtx = gko::as(lower ? mtx->clone() : mtx->transpose()); + auto local_dmtx = + gko::as(lower ? 
dmtx->clone() : dmtx->transpose()); + + gko::kernels::reference::par_ilut_factorization::threshold_filter( + ref, local_mtx.get(), threshold, res.get(), res_coo.get(), lower); + gko::kernels::omp::par_ilut_factorization::threshold_filter( + omp, local_dmtx.get(), threshold, dres.get(), dres_coo.get(), + lower); + + GKO_ASSERT_MTX_NEAR(res, dres, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(res, dres); + GKO_ASSERT_MTX_NEAR(res, res_coo, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(res, res_coo); + GKO_ASSERT_MTX_NEAR(dres, dres_coo, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(dres, dres_coo); + } + + void test_filter_approx(const std::unique_ptr &mtx, + const std::unique_ptr &dmtx, index_type rank) + { + auto res = Csr::create(ref, mtx_size); + auto dres = Csr::create(omp, mtx_size); + auto res_coo = Coo::create(ref, mtx_size); + auto dres_coo = Coo::create(omp, mtx_size); + + gko::Array tmp(ref); + gko::Array dtmp(omp); + gko::remove_complex threshold{}; + gko::remove_complex dthreshold{}; + + gko::kernels::reference::par_ilut_factorization:: + threshold_filter_approx(ref, mtx.get(), rank, tmp, threshold, + res.get(), res_coo.get()); + gko::kernels::omp::par_ilut_factorization::threshold_filter_approx( + omp, dmtx.get(), rank, dtmp, dthreshold, dres.get(), + dres_coo.get()); + + GKO_ASSERT_MTX_NEAR(res, dres, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(res, dres); + GKO_ASSERT_MTX_NEAR(res, res_coo, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(res, res_coo); + GKO_ASSERT_MTX_NEAR(dres, dres_coo, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(dres, dres_coo); + ASSERT_EQ(threshold, dthreshold); + } + + std::shared_ptr ref; + std::shared_ptr omp; + + const gko::dim<2> mtx_size; + std::default_random_engine rand_engine; + + std::unique_ptr mtx1; + std::unique_ptr mtx2; + std::unique_ptr mtx_square; + std::unique_ptr mtx_ani; + std::unique_ptr mtx_l_ani; + std::unique_ptr mtx_u_ani; + std::unique_ptr mtx_ut_ani; + std::unique_ptr mtx_l; + std::unique_ptr mtx_l2; + std::unique_ptr mtx_u; + + std::unique_ptr dmtx1; + std::unique_ptr dmtx2; 
+ std::unique_ptr dmtx_square; + std::unique_ptr dmtx_ani; + std::unique_ptr dmtx_l_ani; + std::unique_ptr dmtx_u_ani; + std::unique_ptr dmtx_ut_ani; + std::unique_ptr dmtx_l; + std::unique_ptr dmtx_l2; + std::unique_ptr dmtx_u; +}; + +TYPED_TEST_CASE(ParIlut, gko::test::ValueIndexTypes); + + +TYPED_TEST(ParIlut, KernelThresholdSelectIsEquivalentToRef) +{ + this->test_select(this->mtx_l, this->dmtx_l, + this->mtx_l->get_num_stored_elements() / 3); +} + + +TYPED_TEST(ParIlut, KernelThresholdSelectMinIsEquivalentToRef) +{ + this->test_select(this->mtx_l, this->dmtx_l, 0); +} + + +TYPED_TEST(ParIlut, KernelThresholdSelectMaxIsEquivalentToRef) +{ + this->test_select(this->mtx_l, this->dmtx_l, + this->mtx_l->get_num_stored_elements() - 1); +} + + +TYPED_TEST(ParIlut, KernelThresholdFilterNullptrCooIsEquivalentToRef) +{ + using Csr = typename TestFixture::Csr; + using Coo = typename TestFixture::Coo; + auto res = Csr::create(this->ref, this->mtx_size); + auto dres = Csr::create(this->omp, this->mtx_size); + Coo *null_coo = nullptr; + + gko::kernels::reference::par_ilut_factorization::threshold_filter( + this->ref, this->mtx_l.get(), 0.5, res.get(), null_coo, true); + gko::kernels::omp::par_ilut_factorization::threshold_filter( + this->omp, this->dmtx_l.get(), 0.5, dres.get(), null_coo, true); + + GKO_ASSERT_MTX_NEAR(res, dres, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(res, dres); +} + + +TYPED_TEST(ParIlut, KernelThresholdFilterLowerIsEquivalentToRef) +{ + this->test_filter(this->mtx_l, this->dmtx_l, 0.5, true); +} + + +TYPED_TEST(ParIlut, KernelThresholdFilterUpperIsEquivalentToRef) +{ + this->test_filter(this->mtx_l, this->dmtx_l, 0.5, false); +} + + +TYPED_TEST(ParIlut, KernelThresholdFilterNoneLowerIsEquivalentToRef) +{ + this->test_filter(this->mtx_l, this->dmtx_l, 0, true); +} + + +TYPED_TEST(ParIlut, KernelThresholdFilterNoneUpperIsEquivalentToRef) +{ + this->test_filter(this->mtx_l, this->dmtx_l, 0, false); +} + + +TYPED_TEST(ParIlut, 
KernelThresholdFilterAllLowerIsEquivalentToRef) +{ + this->test_filter(this->mtx_l, this->dmtx_l, 1e6, true); +} + + +TYPED_TEST(ParIlut, KernelThresholdFilterAllUpperIsEquivalentToRef) +{ + this->test_filter(this->mtx_l, this->dmtx_l, 1e6, false); +} + + +TYPED_TEST(ParIlut, KernelThresholdFilterApproxNullptrCooIsEquivalentToRef) +{ + using Csr = typename TestFixture::Csr; + using Coo = typename TestFixture::Coo; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + this->test_filter(this->mtx_l, this->dmtx_l, 0.5, true); + auto res = Csr::create(this->ref, this->mtx_size); + auto dres = Csr::create(this->omp, this->mtx_size); + Coo *null_coo = nullptr; + gko::Array tmp(this->ref); + gko::Array dtmp(this->omp); + gko::remove_complex threshold{}; + gko::remove_complex dthreshold{}; + index_type rank{}; + + gko::kernels::reference::par_ilut_factorization::threshold_filter_approx( + this->ref, this->mtx_l.get(), rank, tmp, threshold, res.get(), + null_coo); + gko::kernels::omp::par_ilut_factorization::threshold_filter_approx( + this->omp, this->dmtx_l.get(), rank, dtmp, dthreshold, dres.get(), + null_coo); + + GKO_ASSERT_MTX_NEAR(res, dres, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(res, dres); + ASSERT_EQ(threshold, dthreshold); +} + + +TYPED_TEST(ParIlut, KernelThresholdFilterApproxLowerIsEquivalentToRef) +{ + this->test_filter_approx(this->mtx_l, this->dmtx_l, + this->mtx_l->get_num_stored_elements() / 2); +} + + +TYPED_TEST(ParIlut, KernelThresholdFilterApproxNoneLowerIsEquivalentToRef) +{ + this->test_filter_approx(this->mtx_l, this->dmtx_l, 0); +} + + +TYPED_TEST(ParIlut, KernelThresholdFilterApproxAllLowerIsEquivalentToRef) +{ + this->test_filter_approx(this->mtx_l, this->dmtx_l, + this->mtx_l->get_num_stored_elements() - 1); +} + + +TYPED_TEST(ParIlut, KernelAddCandidatesIsEquivalentToRef) +{ + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + auto square_size = 
this->mtx_square->get_size(); + auto mtx_lu = Csr::create(this->ref, square_size); + this->mtx_l2->apply(this->mtx_u.get(), mtx_lu.get()); + auto dmtx_lu = Csr::create(this->omp, square_size); + dmtx_lu->copy_from(mtx_lu.get()); + auto res_mtx_l = Csr::create(this->ref, square_size); + auto res_mtx_u = Csr::create(this->ref, square_size); + auto dres_mtx_l = Csr::create(this->omp, square_size); + auto dres_mtx_u = Csr::create(this->omp, square_size); + + gko::kernels::reference::par_ilut_factorization::add_candidates( + this->ref, mtx_lu.get(), this->mtx_square.get(), this->mtx_l2.get(), + this->mtx_u.get(), res_mtx_l.get(), res_mtx_u.get()); + gko::kernels::omp::par_ilut_factorization::add_candidates( + this->omp, dmtx_lu.get(), this->dmtx_square.get(), this->dmtx_l2.get(), + this->dmtx_u.get(), dres_mtx_l.get(), dres_mtx_u.get()); + + GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_l, dres_mtx_l); + GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_u, dres_mtx_u); + GKO_ASSERT_MTX_NEAR(res_mtx_l, dres_mtx_l, r::value); + GKO_ASSERT_MTX_NEAR(res_mtx_u, dres_mtx_u, r::value); +} + + +TYPED_TEST(ParIlut, KernelComputeLUIsEquivalentToRef) +{ + using Csr = typename TestFixture::Csr; + using Coo = typename TestFixture::Coo; + auto square_size = this->mtx_ani->get_size(); + auto mtx_l_coo = Coo::create(this->ref, square_size); + auto mtx_u_coo = Coo::create(this->ref, square_size); + this->mtx_l_ani->convert_to(mtx_l_coo.get()); + this->mtx_u_ani->convert_to(mtx_u_coo.get()); + auto dmtx_l_coo = Coo::create(this->omp, square_size); + auto dmtx_u_coo = Coo::create(this->omp, square_size); + dmtx_l_coo->copy_from(mtx_l_coo.get()); + dmtx_u_coo->copy_from(mtx_u_coo.get()); + + gko::kernels::reference::par_ilut_factorization::compute_l_u_factors( + this->ref, this->mtx_ani.get(), this->mtx_l_ani.get(), mtx_l_coo.get(), + this->mtx_u_ani.get(), mtx_u_coo.get(), this->mtx_ut_ani.get()); + for (int i = 0; i < 20; ++i) { + gko::kernels::omp::par_ilut_factorization::compute_l_u_factors( + this->omp, 
this->dmtx_ani.get(), this->dmtx_l_ani.get(), + dmtx_l_coo.get(), this->dmtx_u_ani.get(), dmtx_u_coo.get(), + this->dmtx_ut_ani.get()); + } + auto dmtx_utt_ani = gko::as(this->dmtx_ut_ani->transpose()); + + GKO_ASSERT_MTX_NEAR(this->mtx_l_ani, this->dmtx_l_ani, 1e-2); + GKO_ASSERT_MTX_NEAR(this->mtx_u_ani, this->dmtx_u_ani, 1e-2); + GKO_ASSERT_MTX_NEAR(this->dmtx_u_ani, dmtx_utt_ani, 0); +} + + +} // namespace diff --git a/omp/test/matrix/coo_kernels.cpp b/omp/test/matrix/coo_kernels.cpp index 355f31fcfa4..2bdf0361faa 100644 --- a/omp/test/matrix/coo_kernels.cpp +++ b/omp/test/matrix/coo_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/coo_kernels.hpp" +#include #include @@ -42,11 +42,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include #include #include +#include "core/matrix/coo_kernels.hpp" #include "core/test/utils.hpp" diff --git a/omp/test/matrix/csr_kernels.cpp b/omp/test/matrix/csr_kernels.cpp index 72fa988ff5e..bb607efd615 100644 --- a/omp/test/matrix/csr_kernels.cpp +++ b/omp/test/matrix/csr_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,13 +30,13 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ -#include "core/matrix/csr_kernels.hpp" - +#include -#include +#include +#include #include -#include +#include #include @@ -45,11 +45,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include #include +#include #include +#include "core/matrix/csr_kernels.hpp" #include "core/test/utils.hpp" @@ -58,6 +59,7 @@ namespace { class Csr : public ::testing::Test { protected: + using Arr = gko::Array; using Mtx = gko::matrix::Csr<>; using Vec = gko::matrix::Dense<>; using ComplexVec = gko::matrix::Dense>; @@ -95,6 +97,8 @@ class Csr : public ::testing::Test { complex_mtx = ComplexMtx::create(ref); complex_mtx->copy_from( gen_mtx(mtx_size[0], mtx_size[1], 1)); + square_mtx = Mtx::create(ref); + square_mtx->copy_from(gen_mtx(mtx_size[0], mtx_size[0], 1)); expected = gen_mtx(mtx_size[0], num_vectors, 1); y = gen_mtx(mtx_size[1], num_vectors, 1); alpha = gko::initialize({2.0}, ref); @@ -103,6 +107,8 @@ class Csr : public ::testing::Test { dmtx->copy_from(mtx.get()); complex_dmtx = ComplexMtx::create(omp); complex_dmtx->copy_from(complex_mtx.get()); + square_dmtx = Mtx::create(omp); + square_dmtx->copy_from(square_mtx.get()); dresult = Vec::create(omp); dresult->copy_from(expected.get()); dy = Vec::create(omp); @@ -111,6 +117,22 @@ class Csr : public ::testing::Test { dalpha->copy_from(alpha.get()); dbeta = Vec::create(omp); dbeta->copy_from(beta.get()); + + std::vector tmp(mtx->get_size()[0], 0); + auto rng = std::default_random_engine{}; + std::iota(tmp.begin(), tmp.end(), 0); + std::shuffle(tmp.begin(), tmp.end(), rng); + std::vector tmp2(mtx->get_size()[1], 0); + std::iota(tmp2.begin(), tmp2.end(), 0); + std::shuffle(tmp2.begin(), tmp2.end(), rng); + rpermute_idxs = + std::unique_ptr(new Arr{ref, tmp.begin(), tmp.end()}); + drpermute_idxs = + std::unique_ptr(new Arr{omp, tmp.begin(), tmp.end()}); + cpermute_idxs = + std::unique_ptr(new Arr{ref, tmp2.begin(), 
tmp2.end()}); + dcpermute_idxs = + std::unique_ptr(new Arr{omp, tmp2.begin(), tmp2.end()}); } struct matrix_pair { @@ -120,7 +142,7 @@ class Csr : public ::testing::Test { matrix_pair gen_unsorted_mtx() { - constexpr int min_nnz_per_row = 2; // Must be larger/equal than 2 + constexpr int min_nnz_per_row = 2; // Must be at least 2 auto local_mtx_ref = gen_mtx(mtx_size[0], mtx_size[1], min_nnz_per_row); for (size_t row = 0; row < mtx_size[0]; ++row) { @@ -153,6 +175,7 @@ class Csr : public ::testing::Test { std::unique_ptr mtx; std::unique_ptr complex_mtx; + std::unique_ptr square_mtx; std::unique_ptr expected; std::unique_ptr y; std::unique_ptr alpha; @@ -160,10 +183,15 @@ class Csr : public ::testing::Test { std::unique_ptr dmtx; std::unique_ptr complex_dmtx; + std::unique_ptr square_dmtx; std::unique_ptr dresult; std::unique_ptr dy; std::unique_ptr dalpha; std::unique_ptr dbeta; + std::unique_ptr rpermute_idxs; + std::unique_ptr drpermute_idxs; + std::unique_ptr cpermute_idxs; + std::unique_ptr dcpermute_idxs; }; @@ -200,6 +228,57 @@ TEST_F(Csr, SimpleApplyToDenseMatrixIsEquivalentToRef) } +TEST_F(Csr, AdvancedApplyToCsrMatrixIsEquivalentToRef) +{ + set_up_apply_data(); + auto trans = mtx->transpose(); + auto d_trans = dmtx->transpose(); + + mtx->apply(alpha.get(), trans.get(), beta.get(), square_mtx.get()); + dmtx->apply(dalpha.get(), d_trans.get(), dbeta.get(), square_dmtx.get()); + + GKO_ASSERT_MTX_NEAR(square_dmtx, square_mtx, 1e-14); + GKO_ASSERT_MTX_EQ_SPARSITY(square_dmtx, square_mtx); + ASSERT_TRUE(square_dmtx->is_sorted_by_column_index()); +} + + +TEST_F(Csr, SimpleApplyToCsrMatrixIsEquivalentToRef) +{ + set_up_apply_data(); + auto trans = mtx->transpose(); + auto d_trans = dmtx->transpose(); + + mtx->apply(trans.get(), square_mtx.get()); + dmtx->apply(d_trans.get(), square_dmtx.get()); + + GKO_ASSERT_MTX_NEAR(square_dmtx, square_mtx, 1e-14); + GKO_ASSERT_MTX_EQ_SPARSITY(square_dmtx, square_mtx); + ASSERT_TRUE(square_dmtx->is_sorted_by_column_index()); +} 
+ + +TEST_F(Csr, AdvancedApplyToIdentityMatrixIsEquivalentToRef) +{ + set_up_apply_data(); + auto a = gen_mtx(mtx_size[0], mtx_size[1], 0); + auto b = gen_mtx(mtx_size[0], mtx_size[1], 0); + auto da = Mtx::create(omp); + auto db = Mtx::create(omp); + da->copy_from(a.get()); + db->copy_from(b.get()); + auto id = gko::matrix::Identity::create(ref, mtx_size[1]); + auto did = gko::matrix::Identity::create(omp, mtx_size[1]); + + a->apply(alpha.get(), id.get(), beta.get(), b.get()); + da->apply(dalpha.get(), did.get(), dbeta.get(), db.get()); + + GKO_ASSERT_MTX_NEAR(b, db, 1e-14); + GKO_ASSERT_MTX_EQ_SPARSITY(b, db); + ASSERT_TRUE(db->is_sorted_by_column_index()); +} + + TEST_F(Csr, AdvancedApplyToDenseMatrixIsEquivalentToRef) { set_up_apply_data(3); @@ -324,7 +403,7 @@ TEST_F(Csr, CalculatesNonzerosPerRow) gko::kernels::omp::csr::calculate_nonzeros_per_row(omp, dmtx.get(), &drow_nnz); - GKO_ASSERT_ARRAY_EQ(&row_nnz, &drow_nnz); + GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz); } @@ -360,6 +439,51 @@ TEST_F(Csr, MoveToHybridIsEquivalentToRef) } +TEST_F(Csr, IsRowPermutable) +{ + set_up_apply_data(); + auto r_permute = mtx->row_permute(rpermute_idxs.get()); + auto dr_permute = dmtx->row_permute(drpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(r_permute.get()), + static_cast(dr_permute.get()), 0); +} + + +TEST_F(Csr, IsColPermutable) +{ + set_up_apply_data(); + auto c_permute = mtx->column_permute(cpermute_idxs.get()); + auto dc_permute = dmtx->column_permute(dcpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(c_permute.get()), + static_cast(dc_permute.get()), 0); +} + + +TEST_F(Csr, IsInverseRowPermutable) +{ + set_up_apply_data(); + auto inverse_r_permute = mtx->inverse_row_permute(rpermute_idxs.get()); + auto d_inverse_r_permute = dmtx->inverse_row_permute(drpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(inverse_r_permute.get()), + static_cast(d_inverse_r_permute.get()), 0); +} + + +TEST_F(Csr, IsInverseColPermutable) +{ + set_up_apply_data(); + auto 
inverse_c_permute = mtx->inverse_column_permute(cpermute_idxs.get()); + auto d_inverse_c_permute = + dmtx->inverse_column_permute(dcpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(inverse_c_permute.get()), + static_cast(d_inverse_c_permute.get()), 0); +} + + TEST_F(Csr, RecognizeSortedMatrixIsEquivalentToRef) { set_up_apply_data(); diff --git a/omp/test/matrix/dense_kernels.cpp b/omp/test/matrix/dense_kernels.cpp index b1d601291f1..dd0aa4fb8d6 100644 --- a/omp/test/matrix/dense_kernels.cpp +++ b/omp/test/matrix/dense_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,26 +30,29 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/dense_kernels.hpp" +#include -#include +#include +#include #include +#include #include #include +#include #include #include -#include #include #include #include #include +#include "core/matrix/dense_kernels.hpp" #include "core/test/utils.hpp" @@ -59,6 +62,8 @@ namespace { class Dense : public ::testing::Test { protected: using Mtx = gko::matrix::Dense<>; + using NormVector = gko::matrix::Dense>; + using Arr = gko::Array; using ComplexMtx = gko::matrix::Dense>; Dense() : rand_engine(15) {} @@ -135,6 +140,22 @@ class Dense : public ::testing::Test { dalpha->copy_from(alpha.get()); dbeta = Mtx::create(omp); dbeta->copy_from(beta.get()); + + std::vector tmp(x->get_size()[0], 0); + auto rng = std::default_random_engine{}; + std::iota(tmp.begin(), tmp.end(), 0); + std::shuffle(tmp.begin(), tmp.end(), rng); + std::vector tmp2(x->get_size()[1], 0); + std::iota(tmp2.begin(), tmp2.end(), 0); + std::shuffle(tmp2.begin(), tmp2.end(), rng); + 
rpermute_idxs = + std::unique_ptr(new Arr{ref, tmp.begin(), tmp.end()}); + drpermute_idxs = + std::unique_ptr(new Arr{omp, tmp.begin(), tmp.end()}); + cpermute_idxs = + std::unique_ptr(new Arr{ref, tmp2.begin(), tmp2.end()}); + dcpermute_idxs = + std::unique_ptr(new Arr{omp, tmp2.begin(), tmp2.end()}); } std::shared_ptr ref; @@ -154,6 +175,10 @@ class Dense : public ::testing::Test { std::unique_ptr dy; std::unique_ptr dalpha; std::unique_ptr dbeta; + std::unique_ptr rpermute_idxs; + std::unique_ptr drpermute_idxs; + std::unique_ptr cpermute_idxs; + std::unique_ptr dcpermute_idxs; }; @@ -250,11 +275,14 @@ TEST_F(Dense, MultipleVectorOmpComputeDotIsEquivalentToRef) TEST_F(Dense, ComputesNorm2IsEquivalentToRef) { set_up_vector_data(20); + auto norm_size = gko::dim<2>{1, x->get_size()[1]}; + auto norm_expected = NormVector::create(this->ref, norm_size); + auto dnorm = NormVector::create(this->omp, norm_size); - x->compute_norm2(expected.get()); - dx->compute_norm2(dresult.get()); + x->compute_norm2(norm_expected.get()); + dx->compute_norm2(dnorm.get()); - GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); + GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, 1e-14); } @@ -524,6 +552,18 @@ TEST_F(Dense, MoveToSellpIsEquivalentToRef) } +TEST_F(Dense, ConvertsEmptyToSellp) +{ + auto dempty_mtx = Mtx::create(omp); + auto dsellp_mtx = gko::matrix::Sellp<>::create(omp); + + dempty_mtx->convert_to(dsellp_mtx.get()); + + ASSERT_EQ(*dsellp_mtx->get_const_slice_sets(), 0); + ASSERT_FALSE(dsellp_mtx->get_size()); +} + + TEST_F(Dense, CalculateMaxNNZPerRowIsEquivalentToRef) { std::size_t ref_max_nnz_per_row = 0; @@ -582,4 +622,48 @@ TEST_F(Dense, IsConjugateTransposable) } +TEST_F(Dense, IsRowPermutable) +{ + set_up_apply_data(); + auto r_permute = x->row_permute(rpermute_idxs.get()); + auto dr_permute = dx->row_permute(drpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(r_permute.get()), + static_cast(dr_permute.get()), 0); +} + + +TEST_F(Dense, IsColPermutable) +{ + 
set_up_apply_data(); + auto c_permute = x->column_permute(cpermute_idxs.get()); + auto dc_permute = dx->column_permute(dcpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(c_permute.get()), + static_cast(dc_permute.get()), 0); +} + + +TEST_F(Dense, IsInverseRowPermutable) +{ + set_up_apply_data(); + auto inverse_r_permute = x->inverse_row_permute(rpermute_idxs.get()); + auto d_inverse_r_permute = dx->inverse_row_permute(drpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(inverse_r_permute.get()), + static_cast(d_inverse_r_permute.get()), 0); +} + + +TEST_F(Dense, IsInverseColPermutable) +{ + set_up_apply_data(); + auto inverse_c_permute = x->inverse_column_permute(cpermute_idxs.get()); + auto d_inverse_c_permute = dx->inverse_column_permute(dcpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(inverse_c_permute.get()), + static_cast(d_inverse_c_permute.get()), 0); +} + + } // namespace diff --git a/omp/test/matrix/ell_kernels.cpp b/omp/test/matrix/ell_kernels.cpp index 98e45f9a7ae..0fbc9173b30 100644 --- a/omp/test/matrix/ell_kernels.cpp +++ b/omp/test/matrix/ell_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/ell_kernels.hpp" +#include #include @@ -39,12 +39,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include #include #include #include #include -#include + + +#include "core/matrix/ell_kernels.hpp" +#include "core/test/utils.hpp" namespace { diff --git a/omp/test/matrix/hybrid_kernels.cpp b/omp/test/matrix/hybrid_kernels.cpp index 13bc0cf3ff0..47e809fc5bd 100644 --- a/omp/test/matrix/hybrid_kernels.cpp +++ b/omp/test/matrix/hybrid_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/hybrid_kernels.hpp" +#include #include @@ -39,12 +39,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include #include #include #include -#include + + +#include "core/matrix/hybrid_kernels.hpp" +#include "core/test/utils.hpp" namespace { diff --git a/omp/test/matrix/sellp_kernels.cpp b/omp/test/matrix/sellp_kernels.cpp index a9a77452978..217ce430f8c 100644 --- a/omp/test/matrix/sellp_kernels.cpp +++ b/omp/test/matrix/sellp_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -39,13 +39,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include #include #include #include #include +#include "core/test/utils.hpp" + + namespace { diff --git a/omp/test/matrix/sparsity_csr_kernels.cpp b/omp/test/matrix/sparsity_csr_kernels.cpp index 91852dbcb53..dea4844885a 100644 --- a/omp/test/matrix/sparsity_csr_kernels.cpp +++ b/omp/test/matrix/sparsity_csr_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/sparsity_csr_kernels.hpp" +#include #include @@ -44,9 +44,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include +#include "core/matrix/sparsity_csr_kernels.hpp" #include "core/test/utils.hpp" @@ -117,7 +117,7 @@ class SparsityCsr : public ::testing::Test { matrix_pair gen_unsorted_mtx() { - constexpr int min_nnz_per_row = 2; // Must be larger/equal than 2 + constexpr int min_nnz_per_row = 2; // Must be at least 2 auto local_mtx_ref = gen_mtx(mtx_size[0], mtx_size[1], min_nnz_per_row); for (size_t row = 0; row < mtx_size[0]; ++row) { @@ -245,10 +245,10 @@ TEST_F(SparsityCsr, RemovesDiagElementsKernelIsEquivalentToRef) dmtx->get_num_nonzeros() - num_diags); gko::kernels::reference::sparsity_csr::remove_diagonal_elements( - ref, tmp.get(), mtx->get_const_row_ptrs(), mtx->get_const_col_idxs()); + ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), tmp.get()); gko::kernels::omp::sparsity_csr::remove_diagonal_elements( - omp, d_tmp.get(), dmtx->get_const_row_ptrs(), - dmtx->get_const_col_idxs()); + omp, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), + d_tmp.get()); 
GKO_ASSERT_MTX_NEAR(tmp.get(), d_tmp.get(), 0.0); } diff --git a/omp/test/preconditioner/CMakeLists.txt b/omp/test/preconditioner/CMakeLists.txt index a0ca5a2e38a..575384a4c84 100644 --- a/omp/test/preconditioner/CMakeLists.txt +++ b/omp/test/preconditioner/CMakeLists.txt @@ -1 +1,2 @@ ginkgo_create_test(jacobi_kernels) +ginkgo_create_test(isai_kernels) diff --git a/omp/test/preconditioner/isai_kernels.cpp b/omp/test/preconditioner/isai_kernels.cpp new file mode 100644 index 00000000000..ea3c52755a1 --- /dev/null +++ b/omp/test/preconditioner/isai_kernels.cpp @@ -0,0 +1,324 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include + + +#include "core/preconditioner/isai_kernels.hpp" +#include "core/test/utils.hpp" + + +namespace { + + +enum struct matrix_type { lower, upper }; +class Isai : public ::testing::Test { +protected: + using value_type = double; + using index_type = gko::int32; + using Csr = gko::matrix::Csr; + using Dense = gko::matrix::Dense; + Isai() : rand_engine(42) {} + + void SetUp() + { + ref = gko::ReferenceExecutor::create(); + omp = gko::OmpExecutor::create(); + } + + std::unique_ptr clone_allocations(const Csr *csr_mtx) + { + if (csr_mtx->get_executor() != ref) { + return {nullptr}; + } + const auto num_elems = csr_mtx->get_num_stored_elements(); + auto sparsity = csr_mtx->clone(); + + // values are now filled with invalid data to catch potential errors + auto begin_values = sparsity->get_values(); + auto end_values = begin_values + num_elems; + std::fill(begin_values, end_values, -gko::one()); + return sparsity; + } + + void initialize_data(matrix_type type, gko::size_type n, + gko::size_type row_limit) + { + const bool for_lower_tm = type == matrix_type::lower; + auto nz_dist = std::uniform_int_distribution(1, row_limit); + auto val_dist = std::uniform_real_distribution(-1., 1.); + mtx = Csr::create(ref); + mtx = gko::test::generate_random_triangular_matrix( + n, n, true, for_lower_tm, nz_dist, val_dist, 
rand_engine, ref, + gko::dim<2>{n, n}); + inverse = clone_allocations(mtx.get()); + + d_mtx = Csr::create(omp); + d_mtx->copy_from(mtx.get()); + d_inverse = Csr::create(omp); + d_inverse->copy_from(inverse.get()); + } + + + std::shared_ptr ref; + std::shared_ptr omp; + + std::default_random_engine rand_engine; + + std::unique_ptr mtx; + std::unique_ptr inverse; + + std::unique_ptr d_mtx; + std::unique_ptr d_inverse; +}; + + +TEST_F(Isai, OmpIsaiGenerateLinverseShortIsEquivalentToRef) +{ + initialize_data(matrix_type::lower, 536, 31); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(omp, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::kernels::omp::isai::generate_tri_inverse( + omp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + true); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_EQ(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, OmpIsaiGenerateUinverseShortIsEquivalentToRef) +{ + initialize_data(matrix_type::upper, 615, 31); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(omp, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::kernels::omp::isai::generate_tri_inverse( + omp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + false); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_EQ(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, OmpIsaiGenerateLinverseLongIsEquivalentToRef) +{ + 
initialize_data(matrix_type::lower, 554, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(omp, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::kernels::omp::isai::generate_tri_inverse( + omp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + true); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_GT(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, OmpIsaiGenerateUinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::upper, 695, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(omp, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::kernels::omp::isai::generate_tri_inverse( + omp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + false); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_GT(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, OmpIsaiGenerateExcessLinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::lower, 518, 40); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::Array da1(omp, a1); + gko::Array da2(omp, a2); + auto e_dim = a1.get_data()[num_rows]; + auto e_nnz = a2.get_data()[num_rows]; + auto excess = Csr::create(ref, gko::dim<2>(e_dim, e_dim), e_nnz); + auto e_rhs = Dense::create(ref, 
gko::dim<2>(e_dim, 1)); + auto dexcess = Csr::create(omp, gko::dim<2>(e_dim, e_dim), e_nnz); + auto de_rhs = Dense::create(omp, gko::dim<2>(e_dim, 1)); + + gko::kernels::reference::isai::generate_excess_system( + ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), + excess.get(), e_rhs.get()); + gko::kernels::omp::isai::generate_excess_system( + omp, d_mtx.get(), d_inverse.get(), da1.get_const_data(), + da2.get_const_data(), dexcess.get(), de_rhs.get()); + + GKO_ASSERT_MTX_EQ_SPARSITY(excess, dexcess); + GKO_ASSERT_MTX_NEAR(excess, dexcess, 0); + GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, OmpIsaiGenerateExcessUinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::upper, 673, 51); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::Array da1(omp, a1); + gko::Array da2(omp, a2); + auto e_dim = a1.get_data()[num_rows]; + auto e_nnz = a2.get_data()[num_rows]; + auto excess = Csr::create(ref, gko::dim<2>(e_dim, e_dim), e_nnz); + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + auto dexcess = Csr::create(omp, gko::dim<2>(e_dim, e_dim), e_nnz); + auto de_rhs = Dense::create(omp, gko::dim<2>(e_dim, 1)); + + gko::kernels::reference::isai::generate_excess_system( + ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), + excess.get(), e_rhs.get()); + gko::kernels::omp::isai::generate_excess_system( + omp, d_mtx.get(), d_inverse.get(), da1.get_const_data(), + da2.get_const_data(), dexcess.get(), de_rhs.get()); + + GKO_ASSERT_MTX_EQ_SPARSITY(excess, dexcess); + GKO_ASSERT_MTX_NEAR(excess, dexcess, 0); + GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, OmpIsaiScatterExcessSolutionLIsEquivalentToRef) +{ + initialize_data(matrix_type::lower, 572, 52); + const auto num_rows = 
mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::Array da1(omp, a1); + auto e_dim = a1.get_data()[num_rows]; + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + std::fill_n(e_rhs->get_values(), e_dim, 123456); + auto de_rhs = Dense::create(omp); + de_rhs->copy_from(lend(e_rhs)); + d_inverse->copy_from(lend(inverse)); + + gko::kernels::reference::isai::scatter_excess_solution( + ref, a1.get_const_data(), e_rhs.get(), inverse.get()); + gko::kernels::omp::isai::scatter_excess_solution( + omp, da1.get_const_data(), de_rhs.get(), d_inverse.get()); + + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, OmpIsaiScatterExcessSolutionUIsEquivalentToRef) +{ + initialize_data(matrix_type::upper, 702, 45); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::Array da1(omp, a1); + auto e_dim = a1.get_data()[num_rows]; + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + std::fill_n(e_rhs->get_values(), e_dim, 123456); + auto de_rhs = Dense::create(omp); + de_rhs->copy_from(lend(e_rhs)); + // overwrite -1 values with inverse + d_inverse->copy_from(lend(inverse)); + + gko::kernels::reference::isai::scatter_excess_solution( + ref, a1.get_const_data(), e_rhs.get(), inverse.get()); + gko::kernels::omp::isai::scatter_excess_solution( + omp, da1.get_const_data(), de_rhs.get(), d_inverse.get()); + + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); + ASSERT_GT(e_dim, 0); +} + + +} // namespace diff --git a/omp/test/preconditioner/jacobi_kernels.cpp b/omp/test/preconditioner/jacobi_kernels.cpp index 67454f8c6b1..0753d22f008 100644 --- a/omp/test/preconditioner/jacobi_kernels.cpp +++ 
b/omp/test/preconditioner/jacobi_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,17 +33,19 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include +#include -#include +#include -#include #include #include +#include "core/test/utils.hpp" + + namespace { @@ -322,6 +324,34 @@ TEST_F(Jacobi, OmpPreconditionerEquivalentToRefWithMPW) } +TEST_F(Jacobi, OmpTransposedPreconditionerEquivalentToRefWithMPW) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + d_bj->copy_from(bj.get()); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj->transpose()), + gko::as(bj->transpose()), 1e-14); +} + + +TEST_F(Jacobi, OmpConjTransposedPreconditionerEquivalentToRefWithMPW) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + d_bj->copy_from(bj.get()); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj->conj_transpose()), + gko::as(bj->conj_transpose()), 1e-14); +} + + TEST_F(Jacobi, OmpApplyEquivalentToRefWithBlockSize32) { initialize_data({0, 32, 64, 96, 128}, {}, {}, 32, 100, 111); @@ -560,6 +590,37 @@ TEST_F(Jacobi, OmpPreconditionerEquivalentToRefWithAdaptivePrecision) } +TEST_F(Jacobi, OmpTransposedPreconditionerEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + d_bj->copy_from(bj.get()); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj->transpose()), + gko::as(bj->transpose()), 1e-14); +} + + +TEST_F(Jacobi, + 
OmpConjTransposedPreconditionerEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + d_bj->copy_from(bj.get()); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj->conj_transpose()), + gko::as(bj->conj_transpose()), 1e-14); +} + + TEST_F(Jacobi, OmpApplyEquivalentToRefWithFullPrecision) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, diff --git a/omp/test/solver/CMakeLists.txt b/omp/test/solver/CMakeLists.txt index e2a017962a5..44d37b6240d 100644 --- a/omp/test/solver/CMakeLists.txt +++ b/omp/test/solver/CMakeLists.txt @@ -1,3 +1,4 @@ +ginkgo_create_test(bicg_kernels) ginkgo_create_test(bicgstab_kernels) ginkgo_create_test(cg_kernels) ginkgo_create_test(cgs_kernels) diff --git a/omp/test/solver/bicg_kernels.cpp b/omp/test/solver/bicg_kernels.cpp new file mode 100644 index 00000000000..2766f1eb910 --- /dev/null +++ b/omp/test/solver/bicg_kernels.cpp @@ -0,0 +1,340 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include +#include +#include + + +#include "core/solver/bicg_kernels.hpp" +#include "core/test/utils.hpp" + + +namespace { + + +class Bicg : public ::testing::Test { +protected: + using Mtx = gko::matrix::Dense<>; + Bicg() : rand_engine(30) {} + + void SetUp() + { + ref = gko::ReferenceExecutor::create(); + omp = gko::OmpExecutor::create(); + } + + void TearDown() + { + if (omp != nullptr) { + ASSERT_NO_THROW(omp->synchronize()); + } + } + + std::unique_ptr gen_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + } + + void initialize_data() + { + int m = 597; + int n = 43; + b = gen_mtx(m, n); + r = gen_mtx(m, n); + z = gen_mtx(m, n); + p = gen_mtx(m, n); + q = gen_mtx(m, n); + r2 = gen_mtx(m, n); + z2 = gen_mtx(m, n); + p2 = gen_mtx(m, n); + q2 = gen_mtx(m, n); + x = gen_mtx(m, n); + beta = gen_mtx(1, n); + prev_rho = gen_mtx(1, n); + rho = gen_mtx(1, n); + stop_status = std::unique_ptr>( + new gko::Array(ref, 
n)); + for (size_t i = 0; i < stop_status->get_num_elems(); ++i) { + stop_status->get_data()[i].reset(); + } + + d_b = Mtx::create(omp); + d_b->copy_from(b.get()); + d_r = Mtx::create(omp); + d_r->copy_from(r.get()); + d_z = Mtx::create(omp); + d_z->copy_from(z.get()); + d_p = Mtx::create(omp); + d_p->copy_from(p.get()); + d_q = Mtx::create(omp); + d_q->copy_from(q.get()); + d_r2 = Mtx::create(omp); + d_r2->copy_from(r2.get()); + d_z2 = Mtx::create(omp); + d_z2->copy_from(z2.get()); + d_p2 = Mtx::create(omp); + d_p2->copy_from(p2.get()); + d_q2 = Mtx::create(omp); + d_q2->copy_from(q2.get()); + d_x = Mtx::create(omp); + d_x->copy_from(x.get()); + d_beta = Mtx::create(omp); + d_beta->copy_from(beta.get()); + d_prev_rho = Mtx::create(omp); + d_prev_rho->copy_from(prev_rho.get()); + d_rho = Mtx::create(omp); + d_rho->copy_from(rho.get()); + d_stop_status = std::unique_ptr>( + new gko::Array(omp, n)); + *d_stop_status = *stop_status; + } + + void make_symetric(Mtx *mtx) + { + for (int i = 0; i < mtx->get_size()[0]; ++i) { + for (int j = i + 1; j < mtx->get_size()[1]; ++j) { + mtx->at(i, j) = mtx->at(j, i); + } + } + } + + void make_diag_dominant(Mtx *mtx) + { + using std::abs; + for (int i = 0; i < mtx->get_size()[0]; ++i) { + auto sum = gko::zero(); + for (int j = 0; j < mtx->get_size()[1]; ++j) { + sum += abs(mtx->at(i, j)); + } + mtx->at(i, i) = sum; + } + } + + void make_spd(Mtx *mtx) + { + make_symetric(mtx); + make_diag_dominant(mtx); + } + + std::shared_ptr ref; + std::shared_ptr omp; + + std::ranlux48 rand_engine; + + std::unique_ptr b; + std::unique_ptr r; + std::unique_ptr z; + std::unique_ptr p; + std::unique_ptr q; + std::unique_ptr r2; + std::unique_ptr z2; + std::unique_ptr p2; + std::unique_ptr q2; + std::unique_ptr x; + std::unique_ptr beta; + std::unique_ptr prev_rho; + std::unique_ptr rho; + std::unique_ptr> stop_status; + + std::unique_ptr d_b; + std::unique_ptr d_r; + std::unique_ptr d_z; + std::unique_ptr d_p; + std::unique_ptr d_q; + 
std::unique_ptr d_r2; + std::unique_ptr d_z2; + std::unique_ptr d_p2; + std::unique_ptr d_q2; + std::unique_ptr d_x; + std::unique_ptr d_beta; + std::unique_ptr d_prev_rho; + std::unique_ptr d_rho; + std::unique_ptr> d_stop_status; +}; + + +TEST_F(Bicg, OmpBicgInitializeIsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::bicg::initialize( + ref, b.get(), r.get(), z.get(), p.get(), q.get(), prev_rho.get(), + rho.get(), r2.get(), z2.get(), p2.get(), q2.get(), stop_status.get()); + gko::kernels::omp::bicg::initialize( + omp, d_b.get(), d_r.get(), d_z.get(), d_p.get(), d_q.get(), + d_prev_rho.get(), d_rho.get(), d_r2.get(), d_z2.get(), d_p2.get(), + d_q2.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14); + GKO_ASSERT_MTX_NEAR(d_z, z, 1e-14); + GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14); + GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14); + GKO_ASSERT_MTX_NEAR(d_r2, r2, 1e-14); + GKO_ASSERT_MTX_NEAR(d_z2, z2, 1e-14); + GKO_ASSERT_MTX_NEAR(d_p2, p2, 1e-14); + GKO_ASSERT_MTX_NEAR(d_q2, q2, 1e-14); + GKO_ASSERT_MTX_NEAR(d_prev_rho, prev_rho, 1e-14); + GKO_ASSERT_MTX_NEAR(d_rho, rho, 1e-14); + GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status); +} + + +TEST_F(Bicg, OmpBicgStep1IsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::bicg::step_1(ref, p.get(), z.get(), p2.get(), + z2.get(), rho.get(), prev_rho.get(), + stop_status.get()); + gko::kernels::omp::bicg::step_1(omp, d_p.get(), d_z.get(), d_p2.get(), + d_z2.get(), d_rho.get(), d_prev_rho.get(), + d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14); + GKO_ASSERT_MTX_NEAR(d_z, z, 1e-14); + GKO_ASSERT_MTX_NEAR(d_p2, p2, 1e-14); + GKO_ASSERT_MTX_NEAR(d_z2, z2, 1e-14); +} + + +TEST_F(Bicg, OmpBicgStep2IsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::bicg::step_2( + ref, x.get(), r.get(), r2.get(), p.get(), q.get(), q2.get(), beta.get(), + rho.get(), stop_status.get()); + gko::kernels::omp::bicg::step_2( + omp, d_x.get(), d_r.get(), d_r2.get(), d_p.get(), 
d_q.get(), d_q2.get(), + d_beta.get(), d_rho.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); + GKO_ASSERT_MTX_NEAR(d_r, r, 1e-14); + GKO_ASSERT_MTX_NEAR(d_r2, r2, 1e-14); + GKO_ASSERT_MTX_NEAR(d_p, p, 1e-14); + GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14); + GKO_ASSERT_MTX_NEAR(d_q2, q2, 1e-14); +} + + +TEST_F(Bicg, ApplyWithSpdMatrixIsEquivalentToRef) +{ + auto mtx = gen_mtx(50, 50); + make_spd(mtx.get()); + auto x = gen_mtx(50, 3); + auto b = gen_mtx(50, 3); + auto d_mtx = Mtx::create(omp); + d_mtx->copy_from(mtx.get()); + auto d_x = Mtx::create(omp); + d_x->copy_from(x.get()); + auto d_b = Mtx::create(omp); + d_b->copy_from(b.get()); + auto bicg_factory = + gko::solver::Bicg<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(50u).on(ref), + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(1e-14) + .on(ref)) + .on(ref); + auto d_bicg_factory = + gko::solver::Bicg<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(50u).on(omp), + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(1e-14) + .on(omp)) + .on(omp); + auto solver = bicg_factory->generate(std::move(mtx)); + auto d_solver = d_bicg_factory->generate(std::move(d_mtx)); + + solver->apply(b.get(), x.get()); + d_solver->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); +} + + +TEST_F(Bicg, ApplyWithRandomMatrixIsEquivalentToRef) +{ + auto mtx = gen_mtx(50, 50); + auto x = gen_mtx(50, 3); + auto b = gen_mtx(50, 3); + auto d_mtx = Mtx::create(omp); + d_mtx->copy_from(mtx.get()); + auto d_x = Mtx::create(omp); + d_x->copy_from(x.get()); + auto d_b = Mtx::create(omp); + d_b->copy_from(b.get()); + auto bicg_factory = + gko::solver::Bicg<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(50u).on(ref), + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(1e-14) + .on(ref)) + .on(ref); + auto d_bicg_factory = + gko::solver::Bicg<>::build() + .with_criteria( + 
gko::stop::Iteration::build().with_max_iters(50u).on(omp), + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(1e-14) + .on(omp)) + .on(omp); + auto solver = bicg_factory->generate(std::move(mtx)); + auto d_solver = d_bicg_factory->generate(std::move(d_mtx)); + + solver->apply(b.get(), x.get()); + d_solver->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); +} + + +} // namespace diff --git a/omp/test/solver/bicgstab_kernels.cpp b/omp/test/solver/bicgstab_kernels.cpp index 7b5d96bccb4..5a81532f787 100644 --- a/omp/test/solver/bicgstab_kernels.cpp +++ b/omp/test/solver/bicgstab_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,21 +33,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include +#include -#include +#include -#include -#include #include #include #include #include #include #include -#include +#include + + +#include "core/solver/bicgstab_kernels.hpp" +#include "core/test/utils.hpp" namespace { @@ -256,7 +258,7 @@ TEST_F(Bicgstab, OmpBicgstabInitializeIsEquivalentToRef) GKO_EXPECT_MTX_NEAR(d_beta, beta, 1e-14); GKO_EXPECT_MTX_NEAR(d_gamma, gamma, 1e-14); GKO_EXPECT_MTX_NEAR(d_omega, omega, 1e-14); - GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status); + GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status); } diff --git a/omp/test/solver/cg_kernels.cpp b/omp/test/solver/cg_kernels.cpp index db3fd60798e..695789f0205 100644 --- a/omp/test/solver/cg_kernels.cpp +++ b/omp/test/solver/cg_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -33,20 +33,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include +#include -#include +#include -#include -#include #include #include #include #include #include -#include +#include + + +#include "core/solver/cg_kernels.hpp" +#include "core/test/utils.hpp" + namespace { @@ -192,7 +195,7 @@ TEST_F(Cg, OmpCgInitializeIsEquivalentToRef) GKO_ASSERT_MTX_NEAR(d_q, q, 1e-14); GKO_ASSERT_MTX_NEAR(d_prev_rho, prev_rho, 1e-14); GKO_ASSERT_MTX_NEAR(d_rho, rho, 1e-14); - GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status); + GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status); } diff --git a/omp/test/solver/cgs_kernels.cpp b/omp/test/solver/cgs_kernels.cpp index 51b45f0ab5b..7fabfe22e93 100644 --- a/omp/test/solver/cgs_kernels.cpp +++ b/omp/test/solver/cgs_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,20 +33,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include +#include -#include +#include -#include -#include #include #include #include #include #include -#include +#include + + +#include "core/solver/cgs_kernels.hpp" +#include "core/test/utils.hpp" namespace { @@ -248,7 +250,7 @@ TEST_F(Cgs, OmpCgsInitializeIsEquivalentToRef) GKO_ASSERT_MTX_NEAR(d_alpha, alpha, 1e-14); GKO_ASSERT_MTX_NEAR(d_beta, beta, 1e-14); GKO_ASSERT_MTX_NEAR(d_gamma, gamma, 1e-14); - GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status); + GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status); } diff --git a/omp/test/solver/fcg_kernels.cpp b/omp/test/solver/fcg_kernels.cpp index 5935d1233ea..af7fe606413 100644 --- a/omp/test/solver/fcg_kernels.cpp +++ b/omp/test/solver/fcg_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,20 +33,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include +#include -#include +#include -#include -#include #include #include #include #include #include -#include +#include + + +#include "core/solver/fcg_kernels.hpp" +#include "core/test/utils.hpp" + namespace { @@ -206,7 +209,7 @@ TEST_F(Fcg, OmpFcgInitializeIsEquivalentToRef) GKO_ASSERT_MTX_NEAR(d_prev_rho, prev_rho, 1e-14); GKO_ASSERT_MTX_NEAR(d_rho, rho, 1e-14); GKO_ASSERT_MTX_NEAR(d_rho_t, rho_t, 1e-14); - GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status); + GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status); } diff --git a/omp/test/solver/gmres_kernels.cpp b/omp/test/solver/gmres_kernels.cpp index 229d6aa1c1b..aa845b21ff0 100644 --- a/omp/test/solver/gmres_kernels.cpp +++ b/omp/test/solver/gmres_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/solver/gmres_kernels.hpp" +#include #include @@ -41,13 +41,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include -#include #include #include -#include +#include +#include "core/solver/gmres_kernels.hpp" #include "core/test/utils.hpp" @@ -56,7 +57,14 @@ namespace { class Gmres : public ::testing::Test { protected: - using Mtx = gko::matrix::Dense<>; + using value_type = gko::default_precision; + using index_type = gko::int32; + using Mtx = gko::matrix::Dense; + using norm_type = gko::remove_complex; + using NormVector = gko::matrix::Dense; + template + using Dense = typename gko::matrix::Dense; + Gmres() : rand_engine(30) {} void SetUp() @@ -72,12 +80,13 @@ class Gmres : public ::testing::Test { } } - std::unique_ptr gen_mtx(int num_rows, int num_cols) + template + std::unique_ptr> gen_mtx(int num_rows, int num_cols) { - return gko::test::generate_random_matrix( + return gko::test::generate_random_matrix>( num_rows, num_cols, - std::uniform_int_distribution<>(num_cols, num_cols), - std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + std::uniform_int_distribution(num_cols, num_cols), + std::normal_distribution(-1.0, 1.0), rand_engine, ref); } void initialize_data() @@ -88,14 +97,12 @@ class Gmres : public ::testing::Test { y = gen_mtx(gko::solver::default_krylov_dim, n); before_preconditioner = Mtx::create_with_config_of(x.get()); b = gen_mtx(m, n); - b_norm = gen_mtx(1, n); - krylov_bases = gen_mtx(m, (gko::solver::default_krylov_dim + 1) * n); - next_krylov_basis = gen_mtx(m, n); + krylov_bases = gen_mtx(m * (gko::solver::default_krylov_dim + 1), n); hessenberg = gen_mtx(gko::solver::default_krylov_dim + 
1, gko::solver::default_krylov_dim * n); hessenberg_iter = gen_mtx(gko::solver::default_krylov_dim + 1, n); residual = gen_mtx(m, n); - residual_norm = gen_mtx(1, n); + residual_norm = gen_mtx(1, n); residual_norm_collection = gen_mtx(gko::solver::default_krylov_dim + 1, n); givens_sin = gen_mtx(gko::solver::default_krylov_dim, n); @@ -118,19 +125,15 @@ class Gmres : public ::testing::Test { d_y->copy_from(y.get()); d_b = Mtx::create(omp); d_b->copy_from(b.get()); - d_b_norm = Mtx::create(omp); - d_b_norm->copy_from(b_norm.get()); d_krylov_bases = Mtx::create(omp); d_krylov_bases->copy_from(krylov_bases.get()); - d_next_krylov_basis = Mtx::create(omp); - d_next_krylov_basis->copy_from(next_krylov_basis.get()); d_hessenberg = Mtx::create(omp); d_hessenberg->copy_from(hessenberg.get()); d_hessenberg_iter = Mtx::create(omp); d_hessenberg_iter->copy_from(hessenberg_iter.get()); d_residual = Mtx::create(omp); d_residual->copy_from(residual.get()); - d_residual_norm = Mtx::create(omp); + d_residual_norm = NormVector::create(omp); d_residual_norm->copy_from(residual_norm.get()); d_residual_norm_collection = Mtx::create(omp); d_residual_norm_collection->copy_from(residual_norm_collection.get()); @@ -155,13 +158,11 @@ class Gmres : public ::testing::Test { std::unique_ptr x; std::unique_ptr y; std::unique_ptr b; - std::unique_ptr b_norm; std::unique_ptr krylov_bases; - std::unique_ptr next_krylov_basis; std::unique_ptr hessenberg; std::unique_ptr hessenberg_iter; std::unique_ptr residual; - std::unique_ptr residual_norm; + std::unique_ptr residual_norm; std::unique_ptr residual_norm_collection; std::unique_ptr givens_sin; std::unique_ptr givens_cos; @@ -172,13 +173,11 @@ class Gmres : public ::testing::Test { std::unique_ptr d_before_preconditioner; std::unique_ptr d_y; std::unique_ptr d_b; - std::unique_ptr d_b_norm; std::unique_ptr d_krylov_bases; - std::unique_ptr d_next_krylov_basis; std::unique_ptr d_hessenberg; std::unique_ptr d_hessenberg_iter; std::unique_ptr 
d_residual; - std::unique_ptr d_residual_norm; + std::unique_ptr d_residual_norm; std::unique_ptr d_residual_norm_collection; std::unique_ptr d_givens_sin; std::unique_ptr d_givens_cos; @@ -192,18 +191,17 @@ TEST_F(Gmres, OmpGmresInitialize1IsEquivalentToRef) initialize_data(); gko::kernels::reference::gmres::initialize_1( - ref, b.get(), b_norm.get(), residual.get(), givens_sin.get(), - givens_cos.get(), stop_status.get(), gko::solver::default_krylov_dim); + ref, b.get(), residual.get(), givens_sin.get(), givens_cos.get(), + stop_status.get(), gko::solver::default_krylov_dim); gko::kernels::omp::gmres::initialize_1( - omp, d_b.get(), d_b_norm.get(), d_residual.get(), d_givens_sin.get(), + omp, d_b.get(), d_residual.get(), d_givens_sin.get(), d_givens_cos.get(), d_stop_status.get(), gko::solver::default_krylov_dim); - GKO_ASSERT_MTX_NEAR(d_b_norm, b_norm, 1e-14); GKO_ASSERT_MTX_NEAR(d_residual, residual, 1e-14); GKO_ASSERT_MTX_NEAR(d_givens_sin, givens_sin, 1e-14); GKO_ASSERT_MTX_NEAR(d_givens_cos, givens_cos, 1e-14); - GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status); + GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status); } @@ -224,7 +222,7 @@ TEST_F(Gmres, OmpGmresInitialize2IsEquivalentToRef) GKO_ASSERT_MTX_NEAR(d_residual_norm_collection, residual_norm_collection, 1e-14); GKO_ASSERT_MTX_NEAR(d_krylov_bases, krylov_bases, 1e-14); - GKO_ASSERT_ARRAY_EQ(d_final_iter_nums, final_iter_nums); + GKO_ASSERT_ARRAY_EQ(*d_final_iter_nums, *final_iter_nums); } @@ -234,17 +232,15 @@ TEST_F(Gmres, OmpGmresStep1IsEquivalentToRef) int iter = 5; gko::kernels::reference::gmres::step_1( - ref, next_krylov_basis.get(), givens_sin.get(), givens_cos.get(), + ref, x->get_size()[0], givens_sin.get(), givens_cos.get(), residual_norm.get(), residual_norm_collection.get(), krylov_bases.get(), - hessenberg_iter.get(), b_norm.get(), iter, final_iter_nums.get(), - stop_status.get()); + hessenberg_iter.get(), iter, final_iter_nums.get(), stop_status.get()); gko::kernels::omp::gmres::step_1( - 
omp, d_next_krylov_basis.get(), d_givens_sin.get(), d_givens_cos.get(), + omp, d_x->get_size()[0], d_givens_sin.get(), d_givens_cos.get(), d_residual_norm.get(), d_residual_norm_collection.get(), - d_krylov_bases.get(), d_hessenberg_iter.get(), d_b_norm.get(), iter, + d_krylov_bases.get(), d_hessenberg_iter.get(), iter, d_final_iter_nums.get(), d_stop_status.get()); - GKO_ASSERT_MTX_NEAR(d_next_krylov_basis, next_krylov_basis, 1e-14); GKO_ASSERT_MTX_NEAR(d_givens_sin, givens_sin, 1e-14); GKO_ASSERT_MTX_NEAR(d_givens_cos, givens_cos, 1e-14); GKO_ASSERT_MTX_NEAR(d_residual_norm, residual_norm, 1e-14); @@ -252,7 +248,7 @@ TEST_F(Gmres, OmpGmresStep1IsEquivalentToRef) 1e-14); GKO_ASSERT_MTX_NEAR(d_hessenberg_iter, hessenberg_iter, 1e-14); GKO_ASSERT_MTX_NEAR(d_krylov_bases, krylov_bases, 1e-14); - GKO_ASSERT_ARRAY_EQ(d_final_iter_nums, final_iter_nums); + GKO_ASSERT_ARRAY_EQ(*d_final_iter_nums, *final_iter_nums); } diff --git a/omp/test/solver/ir_kernels.cpp b/omp/test/solver/ir_kernels.cpp index 247467af2ca..a1b7b55f448 100644 --- a/omp/test/solver/ir_kernels.cpp +++ b/omp/test/solver/ir_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,21 +33,24 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include +#include -#include +#include -#include -#include #include #include #include +#include #include #include +#include "core/solver/ir_kernels.hpp" +#include "core/test/utils.hpp" + + namespace { @@ -126,4 +129,124 @@ TEST_F(Ir, ApplyIsEquivalentToRef) } +TEST_F(Ir, ApplyWithIterativeInnerSolverIsEquivalentToRef) +{ + auto mtx = gen_mtx(50, 50); + auto x = gen_mtx(50, 3); + auto b = gen_mtx(50, 3); + auto d_mtx = clone(omp, mtx); + auto d_x = clone(omp, x); + auto d_b = clone(omp, b); + + auto ir_factory = + gko::solver::Ir<>::build() + .with_solver( + gko::solver::Gmres<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on( + ref)) + .on(ref)) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(2u).on(ref)) + .on(ref); + auto d_ir_factory = + gko::solver::Ir<>::build() + .with_solver( + gko::solver::Gmres<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on( + omp)) + .on(omp)) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(2u).on(omp)) + .on(omp); + auto solver = ir_factory->generate(std::move(mtx)); + auto d_solver = d_ir_factory->generate(std::move(d_mtx)); + + solver->apply(lend(b), lend(x)); + d_solver->apply(lend(d_b), lend(d_x)); + + // Note: 1e-12 instead of 1e-14, as the difference in the inner gmres + // iteration gets amplified by the difference in IR. 
+ GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Ir, RichardsonApplyIsEquivalentToRef) +{ + auto mtx = gen_mtx(50, 50); + auto x = gen_mtx(50, 3); + auto b = gen_mtx(50, 3); + auto d_mtx = clone(omp, mtx); + auto d_x = clone(omp, x); + auto d_b = clone(omp, b); + // Forget about accuracy - Richardson is not going to converge for a random + // matrix, just check that a couple of iterations gives the same result on + // both executors + auto ir_factory = + gko::solver::Ir<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(2u).on(ref)) + .with_relaxation_factor(0.9) + .on(ref); + auto d_ir_factory = + gko::solver::Ir<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(2u).on(omp)) + .with_relaxation_factor(0.9) + .on(omp); + auto solver = ir_factory->generate(std::move(mtx)); + auto d_solver = d_ir_factory->generate(std::move(d_mtx)); + + solver->apply(lend(b), lend(x)); + d_solver->apply(lend(d_b), lend(d_x)); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); +} + + +TEST_F(Ir, RichardsonApplyWithIterativeInnerSolverIsEquivalentToRef) +{ + auto mtx = gen_mtx(50, 50); + auto x = gen_mtx(50, 3); + auto b = gen_mtx(50, 3); + auto d_mtx = clone(omp, mtx); + auto d_x = clone(omp, x); + auto d_b = clone(omp, b); + auto ir_factory = + gko::solver::Ir<>::build() + .with_solver( + gko::solver::Gmres<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on( + ref)) + .on(ref)) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(2u).on(ref)) + .with_relaxation_factor(0.9) + .on(ref); + auto d_ir_factory = + gko::solver::Ir<>::build() + .with_solver( + gko::solver::Gmres<>::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on( + omp)) + .on(omp)) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(2u).on(omp)) + .with_relaxation_factor(0.9) + .on(omp); + auto solver = ir_factory->generate(std::move(mtx)); + auto d_solver = 
d_ir_factory->generate(std::move(d_mtx)); + + solver->apply(lend(b), lend(x)); + d_solver->apply(lend(d_b), lend(d_x)); + + // Note: 1e-12 instead of 1e-14, as the difference in the inner gmres + // iteration gets amplified by the difference in IR. + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + } // namespace diff --git a/omp/test/solver/lower_trs_kernels.cpp b/omp/test/solver/lower_trs_kernels.cpp index 0d86e5e65cb..52c21bfa356 100644 --- a/omp/test/solver/lower_trs_kernels.cpp +++ b/omp/test/solver/lower_trs_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/omp/test/solver/upper_trs_kernels.cpp b/omp/test/solver/upper_trs_kernels.cpp index 3d322fe9c3d..db6097f6623 100644 --- a/omp/test/solver/upper_trs_kernels.cpp +++ b/omp/test/solver/upper_trs_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/omp/test/stop/CMakeLists.txt b/omp/test/stop/CMakeLists.txt index 5e686b8fbb4..0ba0781e077 100644 --- a/omp/test/stop/CMakeLists.txt +++ b/omp/test/stop/CMakeLists.txt @@ -1,2 +1,2 @@ ginkgo_create_test(criterion_kernels) -ginkgo_create_test(residual_norm_reduction_kernels) +ginkgo_create_test(residual_norm_kernels) diff --git a/omp/test/stop/criterion_kernels.cpp b/omp/test/stop/criterion_kernels.cpp index df98120cffd..8ab87f2b6f2 100644 --- a/omp/test/stop/criterion_kernels.cpp +++ b/omp/test/stop/criterion_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -31,12 +31,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ #include -#include #include +#include + + namespace { diff --git a/omp/test/stop/residual_norm_kernels.cpp b/omp/test/stop/residual_norm_kernels.cpp new file mode 100644 index 00000000000..3d33fb59628 --- /dev/null +++ b/omp/test/stop/residual_norm_kernels.cpp @@ -0,0 +1,348 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +template +class ResidualNormReduction : public ::testing::Test { +protected: + using Mtx = gko::matrix::Dense; + using NormVector = gko::matrix::Dense>; + + ResidualNormReduction() + { + omp_ = gko::OmpExecutor::create(); + factory_ = gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) + .on(omp_); + } + + std::unique_ptr::Factory> + factory_; + std::shared_ptr omp_; +}; + +TYPED_TEST_CASE(ResidualNormReduction, gko::test::ValueTypes); + + +TYPED_TEST(ResidualNormReduction, WaitsTillResidualGoal) +{ + using Mtx = typename TestFixture::Mtx; + using NormVector = typename TestFixture::NormVector; + auto initial_res = gko::initialize({100.0}, this->omp_); + std::shared_ptr rhs = gko::initialize({10.0}, this->omp_); + auto res_norm = gko::initialize({100.0}, this->omp_); + auto criterion = + this->factory_->generate(nullptr, rhs, nullptr, initial_res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(this->omp_, 1); + stop_status.get_data()[0].reset(); + + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res_norm->at(0) = r::value * 1.1e+2; + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, 
true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[0].has_converged(), false); + ASSERT_EQ(one_changed, false); + + res_norm->at(0) = r::value * 0.9e+2; + ASSERT_TRUE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[0].has_converged(), true); + ASSERT_EQ(one_changed, true); +} + + +TYPED_TEST(ResidualNormReduction, WaitsTillResidualGoalMultipleRHS) +{ + using Mtx = typename TestFixture::Mtx; + using NormVector = typename TestFixture::NormVector; + using T = TypeParam; + using T_nc = gko::remove_complex; + auto res = gko::initialize({I{100.0, 100.0}}, this->omp_); + auto res_norm = + gko::initialize({I{100.0, 100.0}}, this->omp_); + std::shared_ptr rhs = + gko::initialize({I{10.0, 10.0}}, this->omp_); + auto criterion = this->factory_->generate(nullptr, rhs, nullptr, res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(this->omp_, 2); + stop_status.get_data()[0].reset(); + stop_status.get_data()[1].reset(); + + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res_norm->at(0, 0) = r::value * 0.9e+2; + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[0].has_converged(), true); + ASSERT_EQ(one_changed, true); + + res_norm->at(0, 1) = r::value * 0.9e+2; + ASSERT_TRUE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[1].has_converged(), true); + ASSERT_EQ(one_changed, true); +} + + +template +class RelativeResidualNorm : public ::testing::Test { +protected: + using Mtx = gko::matrix::Dense; + using NormVector = gko::matrix::Dense>; + + RelativeResidualNorm() + { + omp_ = 
gko::OmpExecutor::create(); + factory_ = gko::stop::RelativeResidualNorm::build() + .with_tolerance(r::value) + .on(omp_); + } + + std::unique_ptr::Factory> + factory_; + std::shared_ptr omp_; +}; + +TYPED_TEST_CASE(RelativeResidualNorm, gko::test::ValueTypes); + + +TYPED_TEST(RelativeResidualNorm, WaitsTillResidualGoal) +{ + using Mtx = typename TestFixture::Mtx; + using NormVector = typename TestFixture::NormVector; + auto initial_res = gko::initialize({100.0}, this->omp_); + std::shared_ptr rhs = gko::initialize({10.0}, this->omp_); + auto res_norm = gko::initialize({100.0}, this->omp_); + auto criterion = + this->factory_->generate(nullptr, rhs, nullptr, initial_res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(this->omp_, 1); + stop_status.get_data()[0].reset(); + + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res_norm->at(0) = r::value * 1.1e+1; + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[0].has_converged(), false); + ASSERT_EQ(one_changed, false); + + res_norm->at(0) = r::value * 0.9e+1; + ASSERT_TRUE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[0].has_converged(), true); + ASSERT_EQ(one_changed, true); +} + + +TYPED_TEST(RelativeResidualNorm, WaitsTillResidualGoalMultipleRHS) +{ + using Mtx = typename TestFixture::Mtx; + using NormVector = typename TestFixture::NormVector; + using T = TypeParam; + using T_nc = gko::remove_complex; + auto res = gko::initialize({I{100.0, 100.0}}, this->omp_); + auto res_norm = + gko::initialize({I{100.0, 100.0}}, this->omp_); + std::shared_ptr rhs = + gko::initialize({I{10.0, 10.0}}, this->omp_); + auto criterion = 
this->factory_->generate(nullptr, rhs, nullptr, res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(this->omp_, 2); + stop_status.get_data()[0].reset(); + stop_status.get_data()[1].reset(); + + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res_norm->at(0, 0) = r::value * 0.9e+1; + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[0].has_converged(), true); + ASSERT_EQ(one_changed, true); + + res_norm->at(0, 1) = r::value * 0.9e+1; + ASSERT_TRUE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[1].has_converged(), true); + ASSERT_EQ(one_changed, true); +} + + +template +class AbsoluteResidualNorm : public ::testing::Test { +protected: + using Mtx = gko::matrix::Dense; + using NormVector = gko::matrix::Dense>; + + AbsoluteResidualNorm() + { + omp_ = gko::OmpExecutor::create(); + factory_ = gko::stop::AbsoluteResidualNorm::build() + .with_tolerance(r::value) + .on(omp_); + } + + std::unique_ptr::Factory> + factory_; + std::shared_ptr omp_; +}; + +TYPED_TEST_CASE(AbsoluteResidualNorm, gko::test::ValueTypes); + + +TYPED_TEST(AbsoluteResidualNorm, WaitsTillResidualGoal) +{ + using Mtx = typename TestFixture::Mtx; + using NormVector = typename TestFixture::NormVector; + auto initial_res = gko::initialize({100.0}, this->omp_); + std::shared_ptr rhs = gko::initialize({10.0}, this->omp_); + auto res_norm = gko::initialize({100.0}, this->omp_); + auto criterion = + this->factory_->generate(nullptr, rhs, nullptr, initial_res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(this->omp_, 1); + stop_status.get_data()[0].reset(); + + ASSERT_FALSE( + 
criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res_norm->at(0) = r::value * 1.1; + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[0].has_converged(), false); + ASSERT_EQ(one_changed, false); + + res_norm->at(0) = r::value * 0.9; + ASSERT_TRUE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[0].has_converged(), true); + ASSERT_EQ(one_changed, true); +} + + +TYPED_TEST(AbsoluteResidualNorm, WaitsTillResidualGoalMultipleRHS) +{ + using Mtx = typename TestFixture::Mtx; + using NormVector = typename TestFixture::NormVector; + using T = TypeParam; + using T_nc = gko::remove_complex; + auto res = gko::initialize({I{100.0, 100.0}}, this->omp_); + auto res_norm = + gko::initialize({I{100.0, 100.0}}, this->omp_); + std::shared_ptr rhs = + gko::initialize({I{10.0, 10.0}}, this->omp_); + auto criterion = this->factory_->generate(nullptr, rhs, nullptr, res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(this->omp_, 2); + stop_status.get_data()[0].reset(); + stop_status.get_data()[1].reset(); + + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res_norm->at(0, 0) = r::value * 0.9; + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[0].has_converged(), true); + ASSERT_EQ(one_changed, true); + + res_norm->at(0, 1) = r::value * 0.9; + ASSERT_TRUE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[1].has_converged(), true); + 
ASSERT_EQ(one_changed, true); +} + + +} // namespace diff --git a/omp/test/stop/residual_norm_reduction_kernels.cpp b/omp/test/stop/residual_norm_reduction_kernels.cpp deleted file mode 100644 index e528811a9bb..00000000000 --- a/omp/test/stop/residual_norm_reduction_kernels.cpp +++ /dev/null @@ -1,122 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*************************************************************/ - -#include - - -#include - - -namespace { - - -constexpr double reduction_factor = 1.0e-14; - - -class ResidualNormReduction : public ::testing::Test { -protected: - using Mtx = gko::matrix::Dense<>; - - ResidualNormReduction() - { - omp_ = gko::OmpExecutor::create(); - factory_ = gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(reduction_factor) - .on(omp_); - } - - std::unique_ptr::Factory> factory_; - std::shared_ptr omp_; -}; - - -TEST_F(ResidualNormReduction, WaitsTillResidualGoal) -{ - auto scalar = gko::initialize({1.0}, omp_); - auto criterion = - factory_->generate(nullptr, nullptr, nullptr, scalar.get()); - bool one_changed{}; - constexpr gko::uint8 RelativeStoppingId{1}; - gko::Array stop_status(omp_, 1); - stop_status.get_data()[0].reset(); - - ASSERT_FALSE( - criterion->update() - .residual_norm(scalar.get()) - .check(RelativeStoppingId, true, &stop_status, &one_changed)); - - scalar->at(0) = reduction_factor * 1.0e+2; - ASSERT_FALSE( - criterion->update() - .residual_norm(scalar.get()) - .check(RelativeStoppingId, true, &stop_status, &one_changed)); - ASSERT_EQ(stop_status.get_data()[0].has_converged(), false); - ASSERT_EQ(one_changed, false); - - scalar->at(0) = reduction_factor * 1.0e-2; - ASSERT_TRUE( - criterion->update() - .residual_norm(scalar.get()) - .check(RelativeStoppingId, true, &stop_status, &one_changed)); - ASSERT_EQ(stop_status.get_data()[0].has_converged(), true); - ASSERT_EQ(one_changed, true); -} - - -TEST_F(ResidualNormReduction, WaitsTillResidualGoalMultipleRHS) -{ - auto mtx = gko::initialize({{1.0, 1.0}}, omp_); - auto criterion = factory_->generate(nullptr, nullptr, nullptr, mtx.get()); - bool one_changed{}; - constexpr gko::uint8 RelativeStoppingId{1}; - gko::Array stop_status(omp_, 2); - stop_status.get_data()[0].reset(); - stop_status.get_data()[1].reset(); - - ASSERT_FALSE(criterion->update().residual_norm(mtx.get()).check( - 
RelativeStoppingId, true, &stop_status, &one_changed)); - - mtx->at(0, 0) = reduction_factor * 1.0e-2; - ASSERT_FALSE(criterion->update().residual_norm(mtx.get()).check( - RelativeStoppingId, true, &stop_status, &one_changed)); - ASSERT_EQ(stop_status.get_data()[0].has_converged(), true); - ASSERT_EQ(one_changed, true); - - mtx->at(0, 1) = reduction_factor * 1.0e-2; - ASSERT_TRUE(criterion->update().residual_norm(mtx.get()).check( - RelativeStoppingId, true, &stop_status, &one_changed)); - ASSERT_EQ(stop_status.get_data()[1].has_converged(), true); - ASSERT_EQ(one_changed, true); -} - - -} // namespace diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index 7516fc8641f..9cb2256bf13 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -1,32 +1,45 @@ add_library(ginkgo_reference $ "") target_sources(ginkgo_reference PRIVATE - base/version.cpp - factorization/par_ilu_kernels.cpp - matrix/coo_kernels.cpp - matrix/csr_kernels.cpp - matrix/dense_kernels.cpp - matrix/ell_kernels.cpp - matrix/hybrid_kernels.cpp - matrix/sellp_kernels.cpp - matrix/sparsity_csr_kernels.cpp - preconditioner/jacobi_kernels.cpp - solver/bicgstab_kernels.cpp - solver/cg_kernels.cpp - solver/cgs_kernels.cpp - solver/fcg_kernels.cpp - solver/gmres_kernels.cpp - solver/ir_kernels.cpp - solver/lower_trs_kernels.cpp - solver/upper_trs_kernels.cpp - stop/criterion_kernels.cpp - stop/residual_norm_reduction_kernels.cpp) + base/version.cpp + components/fill_array.cpp + components/precision_conversion.cpp + components/prefix_sum.cpp + factorization/ilu_kernels.cpp + factorization/factorization_kernels.cpp + factorization/par_ict_kernels.cpp + factorization/par_ilu_kernels.cpp + factorization/par_ilut_kernels.cpp + matrix/coo_kernels.cpp + matrix/csr_kernels.cpp + matrix/dense_kernels.cpp + matrix/ell_kernels.cpp + matrix/hybrid_kernels.cpp + matrix/sellp_kernels.cpp + matrix/sparsity_csr_kernels.cpp + preconditioner/isai_kernels.cpp + preconditioner/jacobi_kernels.cpp + 
solver/bicg_kernels.cpp + solver/bicgstab_kernels.cpp + solver/cg_kernels.cpp + solver/cgs_kernels.cpp + solver/fcg_kernels.cpp + solver/gmres_kernels.cpp + solver/ir_kernels.cpp + solver/lower_trs_kernels.cpp + solver/upper_trs_kernels.cpp + stop/criterion_kernels.cpp + stop/residual_norm_kernels.cpp) ginkgo_compile_features(ginkgo_reference) ginkgo_default_includes(ginkgo_reference) ginkgo_install_library(ginkgo_reference reference) target_compile_options(ginkgo_reference PRIVATE "${GINKGO_COMPILER_FLAGS}") +if (GINKGO_CHECK_CIRCULAR_DEPS) + ginkgo_check_headers(ginkgo_reference) +endif() + if(GINKGO_BUILD_TESTS) add_subdirectory(test) endif() diff --git a/reference/base/version.cpp b/reference/base/version.cpp index 5f6b82582b2..aac3a23180e 100644 --- a/reference/base/version.cpp +++ b/reference/base/version.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/reference/components/convert_ptrs.hpp b/reference/components/convert_ptrs.hpp index ee007a96edd..bc89f9f2df0 100644 --- a/reference/components/convert_ptrs.hpp +++ b/reference/components/convert_ptrs.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,6 +34,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include + + namespace gko { namespace kernels { namespace reference { diff --git a/reference/components/csr_spgeam.hpp b/reference/components/csr_spgeam.hpp new file mode 100644 index 00000000000..f09b34d5926 --- /dev/null +++ b/reference/components/csr_spgeam.hpp @@ -0,0 +1,112 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_REFERENCE_COMPONENTS_CSR_SPGEAM_HPP_ +#define GKO_REFERENCE_COMPONENTS_CSR_SPGEAM_HPP_ + + +#include + + +#include + + +#include "core/base/utils.hpp" + + +namespace gko { +namespace kernels { +namespace reference { + + +/** + * Adds two (sorted) sparse matrices. + * + * Calls begin_cb(row) on each row to initialize row-local data + * Calls entry_cb(row, col, a_val, b_val, local_data) on each output non-zero + * Calls end_cb(row, local_data) on each row to finalize row-local data + */ +template +void abstract_spgeam(const matrix::Csr *a, + const matrix::Csr *b, + BeginCallback begin_cb, EntryCallback entry_cb, + EndCallback end_cb) +{ + auto num_rows = a->get_size()[0]; + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto a_vals = a->get_const_values(); + auto b_row_ptrs = b->get_const_row_ptrs(); + auto b_col_idxs = b->get_const_col_idxs(); + auto b_vals = b->get_const_values(); + constexpr auto sentinel = std::numeric_limits::max(); + for (size_type row = 0; row < num_rows; ++row) { + auto a_begin = a_row_ptrs[row]; + auto a_end = a_row_ptrs[row + 1]; + auto b_begin = b_row_ptrs[row]; + auto b_end = b_row_ptrs[row + 1]; + auto total_size = (a_end - a_begin) + (b_end - b_begin); + bool skip{}; + auto local_data = begin_cb(row); + for (IndexType i = 0; i < total_size; ++i) { + if (skip) { + skip = false; + continue; + } + // load column indices or sentinel + auto a_col = checked_load(a_col_idxs, a_begin, a_end, sentinel); + auto b_col = checked_load(b_col_idxs, b_begin, b_end, sentinel); + auto a_val = + checked_load(a_vals, a_begin, a_end, zero()); + auto b_val = + checked_load(b_vals, b_begin, b_end, zero()); + auto col = min(a_col, b_col); + // callback; a side without an entry in this column contributes zero + entry_cb(row, col, a_col == col ? a_val : zero(), + b_col == col ? 
b_val : zero(), local_data); + // advance; a shared column eats two entries, so skip one iteration + a_begin += (a_col <= b_col); + b_begin += (b_col <= a_col); + skip = a_col == b_col; + } + end_cb(row, local_data); + } +} + + +} // namespace reference +} // namespace kernels +} // namespace gko + + +#endif // GKO_REFERENCE_COMPONENTS_CSR_SPGEAM_HPP_ diff --git a/reference/components/fill_array.cpp b/reference/components/fill_array.cpp new file mode 100644 index 00000000000..23499dabd3b --- /dev/null +++ b/reference/components/fill_array.cpp @@ -0,0 +1,57 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/components/fill_array.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +namespace components { + + +template +void fill_array(std::shared_ptr exec, ValueType *array, + size_type n, ValueType val) +{ + std::fill_n(array, n, val); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_FILL_ARRAY_KERNEL); +template GKO_DECLARE_FILL_ARRAY_KERNEL(size_type); + + +} // namespace components +} // namespace reference +} // namespace kernels +} // namespace gko diff --git a/reference/components/format_conversion.hpp b/reference/components/format_conversion.hpp index 95e17374b88..38520dd8b66 100644 --- a/reference/components/format_conversion.hpp +++ b/reference/components/format_conversion.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,6 +34,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include + + namespace gko { namespace kernels { namespace reference { diff --git a/reference/components/matrix_operations.hpp b/reference/components/matrix_operations.hpp index 2214e4cb06d..b62a3a84ec8 100644 --- a/reference/components/matrix_operations.hpp +++ b/reference/components/matrix_operations.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/reference/components/precision_conversion.cpp b/reference/components/precision_conversion.cpp new file mode 100644 index 00000000000..6bc37efe940 --- /dev/null +++ b/reference/components/precision_conversion.cpp @@ -0,0 +1,58 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/components/precision_conversion.hpp" + + +#include + + +namespace gko { +namespace kernels { +namespace reference { +namespace components { + + +template +void convert_precision(std::shared_ptr exec, + size_type size, const SourceType *in, TargetType *out) +{ + std::copy_n(in, size, out); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(GKO_DECLARE_CONVERT_PRECISION_KERNEL); + + +} // namespace components +} // namespace reference +} // namespace kernels +} // namespace gko diff --git a/reference/components/prefix_sum.cpp b/reference/components/prefix_sum.cpp new file mode 100644 index 00000000000..2530e960a68 --- /dev/null +++ b/reference/components/prefix_sum.cpp @@ -0,0 +1,63 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/components/prefix_sum.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +namespace components { + + +template +void prefix_sum(std::shared_ptr exec, + IndexType *counts, size_type num_entries) +{ + IndexType partial_sum{}; // running total of entries seen so far + for (size_type i = 0; i < num_entries; ++i) { + auto nnz = counts[i]; + counts[i] = partial_sum; // sum of counts[0..i) + partial_sum += nnz; + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL); + +// instantiate for size_type as well, as this is used in the Sellp format +template GKO_DECLARE_PREFIX_SUM_KERNEL(size_type); + + +} // namespace components +} // namespace reference +} // namespace kernels +} // namespace gko diff --git a/reference/factorization/factorization_kernels.cpp b/reference/factorization/factorization_kernels.cpp new file mode 100644 index 00000000000..feb64a56b40 --- /dev/null +++ b/reference/factorization/factorization_kernels.cpp @@ -0,0 +1,327 @@ 
+/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/factorization/factorization_kernels.hpp" + + +#include +#include + + +#include +#include + + +#include "core/components/prefix_sum.hpp" +#include "core/matrix/csr_builder.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +/** + * @brief The factorization namespace. 
+ * + * @ingroup factor + */ +namespace factorization { + + +template +size_type count_missing_elements(IndexType num_rows, IndexType num_cols, + const IndexType *col_idxs, + const IndexType *row_ptrs) +{ + size_type missing_elements{}; + // if row >= num_cols, diagonal elements no longer exist + for (IndexType row = 0; row < num_rows && row < num_cols; ++row) { + bool was_diagonal_found{false}; + for (IndexType idx = row_ptrs[row]; idx < row_ptrs[row + 1]; ++idx) { + const auto col = col_idxs[idx]; + if (col == row) { + was_diagonal_found = true; + break; + } + } + if (!was_diagonal_found) { + ++missing_elements; + } + } + return missing_elements; +} + + +template +void add_diagonal_elements(std::shared_ptr exec, + matrix::Csr *mtx, + bool /*is_sorted*/) +{ + const auto values = mtx->get_const_values(); + const auto col_idxs = mtx->get_const_col_idxs(); + auto row_ptrs = mtx->get_row_ptrs(); + auto num_rows = static_cast(mtx->get_size()[0]); + auto num_cols = static_cast(mtx->get_size()[1]); + + auto missing_elements = + count_missing_elements(num_rows, num_cols, col_idxs, row_ptrs); + + if (missing_elements == 0) { + return; + } + + const auto old_nnz = mtx->get_num_stored_elements(); + const size_type new_nnz = old_nnz + missing_elements; + Array new_values_array{exec, new_nnz}; + Array new_col_idxs_array{exec, new_nnz}; + auto new_values = new_values_array.get_data(); + auto new_col_idxs = new_col_idxs_array.get_data(); + IndexType added_elements{}; + // row_ptrs will be updated in-place + + for (IndexType row = 0; row < num_rows; ++row) { + bool diagonal_handled{false}; + const IndexType old_row_ptrs_start{row_ptrs[row]}; + const IndexType old_row_ptrs_end{row_ptrs[row + 1]}; + const IndexType new_row_ptrs_start = + old_row_ptrs_start + added_elements; + + row_ptrs[row] = new_row_ptrs_start; + for (IndexType old_idx = old_row_ptrs_start; old_idx < old_row_ptrs_end; + ++old_idx) { + auto new_idx = old_idx + added_elements; + const auto col_idx = 
col_idxs[old_idx]; + if (!diagonal_handled && col_idx > row) { + const auto start_cols = col_idxs + old_idx; + const auto end_cols = col_idxs + old_row_ptrs_end; + // expect row to not be sorted, so search for a diagonal entry + if (std::find(start_cols, end_cols, row) != end_cols) { + // no need to add diagonal since diagonal is already present + diagonal_handled = true; + } + // if diagonal was not found, add it + if (!diagonal_handled) { + new_values[new_idx] = zero(); + new_col_idxs[new_idx] = row; + ++added_elements; + new_idx = old_idx + added_elements; + diagonal_handled = true; + } + } + if (row >= num_cols || col_idx == row) { + diagonal_handled = true; + } + new_values[new_idx] = values[old_idx]; + new_col_idxs[new_idx] = col_idx; + } + if (row < num_cols && !diagonal_handled) { + const auto new_idx = old_row_ptrs_end + added_elements; + new_values[new_idx] = zero(); + new_col_idxs[new_idx] = row; + diagonal_handled = true; + ++added_elements; + } + } + row_ptrs[num_rows] = new_nnz; + + matrix::CsrBuilder mtx_builder{mtx}; + mtx_builder.get_value_array() = std::move(new_values_array); + mtx_builder.get_col_idx_array() = std::move(new_col_idxs_array); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL); + + +template +void initialize_row_ptrs_l_u( + std::shared_ptr exec, + const matrix::Csr *system_matrix, + IndexType *l_row_ptrs, IndexType *u_row_ptrs) +{ + auto row_ptrs = system_matrix->get_const_row_ptrs(); + auto col_idxs = system_matrix->get_const_col_idxs(); + size_type l_nnz{}; + size_type u_nnz{}; + + l_row_ptrs[0] = 0; + u_row_ptrs[0] = 0; + for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) { + for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) { + size_type col = col_idxs[el]; + // don't count diagonal + l_nnz += col < row; + u_nnz += col > row; + } + // add diagonal again + l_nnz++; + u_nnz++; + l_row_ptrs[row + 1] = l_nnz; + u_row_ptrs[row + 1] = u_nnz; + 
} +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL); + + +template +void initialize_l_u(std::shared_ptr exec, + const matrix::Csr *system_matrix, + matrix::Csr *csr_l, + matrix::Csr *csr_u) +{ + const auto row_ptrs = system_matrix->get_const_row_ptrs(); + const auto col_idxs = system_matrix->get_const_col_idxs(); + const auto vals = system_matrix->get_const_values(); + + const auto row_ptrs_l = csr_l->get_const_row_ptrs(); + auto col_idxs_l = csr_l->get_col_idxs(); + auto vals_l = csr_l->get_values(); + + const auto row_ptrs_u = csr_u->get_const_row_ptrs(); + auto col_idxs_u = csr_u->get_col_idxs(); + auto vals_u = csr_u->get_values(); + + for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) { + size_type current_index_l = row_ptrs_l[row]; + size_type current_index_u = + row_ptrs_u[row] + 1; // we treat the diagonal separately + // if there is no diagonal value, set it to 1 by default + auto diag_val = one(); + for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) { + const auto col = col_idxs[el]; + const auto val = vals[el]; + if (col < row) { + col_idxs_l[current_index_l] = col; + vals_l[current_index_l] = val; + ++current_index_l; + } else if (col == row) { + // save diagonal value + diag_val = val; + } else { // col > row + col_idxs_u[current_index_u] = col; + vals_u[current_index_u] = val; + ++current_index_u; + } + } + // store diagonal values separately + auto l_diag_idx = row_ptrs_l[row + 1] - 1; + auto u_diag_idx = row_ptrs_u[row]; + col_idxs_l[l_diag_idx] = row; + col_idxs_u[u_diag_idx] = row; + vals_l[l_diag_idx] = one(); + vals_u[u_diag_idx] = diag_val; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL); + + +template +void initialize_row_ptrs_l( + std::shared_ptr exec, + const matrix::Csr *system_matrix, + IndexType *l_row_ptrs) +{ + auto row_ptrs = system_matrix->get_const_row_ptrs(); + auto col_idxs = 
system_matrix->get_const_col_idxs(); + size_type l_nnz{}; + + l_row_ptrs[0] = 0; + for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) { + for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) { + size_type col = col_idxs[el]; + // skip diagonal + l_nnz += col < row; + } + // add diagonal again + l_nnz++; + l_row_ptrs[row + 1] = l_nnz; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL); + + +template +void initialize_l(std::shared_ptr exec, + const matrix::Csr *system_matrix, + matrix::Csr *csr_l, bool diag_sqrt) +{ + const auto row_ptrs = system_matrix->get_const_row_ptrs(); + const auto col_idxs = system_matrix->get_const_col_idxs(); + const auto vals = system_matrix->get_const_values(); + + const auto row_ptrs_l = csr_l->get_const_row_ptrs(); + auto col_idxs_l = csr_l->get_col_idxs(); + auto vals_l = csr_l->get_values(); + + for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) { + size_type current_index_l = row_ptrs_l[row]; + // if there is no diagonal value, set it to 1 by default + auto diag_val = one(); + for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) { + const auto col = col_idxs[el]; + const auto val = vals[el]; + if (col < row) { + col_idxs_l[current_index_l] = col; + vals_l[current_index_l] = val; + ++current_index_l; + } else if (col == row) { + // save diagonal value + diag_val = val; + } + } + // store diagonal values separately + auto l_diag_idx = row_ptrs_l[row + 1] - 1; + col_idxs_l[l_diag_idx] = row; + // compute square root with sentinel + if (diag_sqrt) { + diag_val = sqrt(diag_val); + if (!is_finite(diag_val)) { + diag_val = one(); + } + } + vals_l[l_diag_idx] = diag_val; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL); + + +} // namespace factorization +} // namespace reference +} // namespace kernels +} // namespace gko diff --git 
a/reference/factorization/ilu_kernels.cpp b/reference/factorization/ilu_kernels.cpp new file mode 100644 index 00000000000..279bfda5c18 --- /dev/null +++ b/reference/factorization/ilu_kernels.cpp @@ -0,0 +1,58 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/factorization/ilu_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +/** + * @brief The ilu factorization namespace. + * + * @ingroup factor + */ +namespace ilu_factorization { + + +template +void compute_lu(std::shared_ptr exec, + matrix::Csr *m) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ILU_COMPUTE_LU_KERNEL); + + +} // namespace ilu_factorization +} // namespace reference +} // namespace kernels +} // namespace gko diff --git a/reference/factorization/par_ict_kernels.cpp b/reference/factorization/par_ict_kernels.cpp new file mode 100644 index 00000000000..8114d22b493 --- /dev/null +++ b/reference/factorization/par_ict_kernels.cpp @@ -0,0 +1,209 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/factorization/par_ict_kernels.hpp" + + +#include +#include +#include +#include + + +#include +#include +#include + + +#include "core/base/utils.hpp" +#include "core/components/prefix_sum.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "reference/components/csr_spgeam.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +/** + * @brief The parallel ict factorization namespace. + * + * @ingroup factor + */ +namespace par_ict_factorization { + + +template +void compute_factor(std::shared_ptr exec, + const matrix::Csr *a, + matrix::Csr *l, + const matrix::Coo *) +{ + auto num_rows = a->get_size()[0]; + auto l_row_ptrs = l->get_const_row_ptrs(); + auto l_col_idxs = l->get_const_col_idxs(); + auto l_vals = l->get_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto a_vals = a->get_const_values(); + + for (size_type row = 0; row < num_rows; ++row) { + for (size_type l_nz = l_row_ptrs[row]; l_nz < l_row_ptrs[row + 1]; + ++l_nz) { + auto col = l_col_idxs[l_nz]; + // find value from A + auto a_begin = a_row_ptrs[row]; + auto a_end = a_row_ptrs[row + 1]; + auto a_nz_it = + std::lower_bound(a_col_idxs + a_begin, a_col_idxs + a_end, col); + auto a_nz = std::distance(a_col_idxs, a_nz_it); + auto has_a = a_nz < a_end && a_col_idxs[a_nz] == col; + auto a_val = has_a ? 
a_vals[a_nz] : zero(); + // accumulate l(row,:) * l(col,:) without the last entry l(col, col) + ValueType sum{}; + IndexType lt_nz{}; + auto l_begin = l_row_ptrs[row]; + auto l_end = l_row_ptrs[row + 1]; + auto lt_begin = l_row_ptrs[col]; + auto lt_end = l_row_ptrs[col + 1]; + while (l_begin < l_end && lt_begin < lt_end) { + auto l_col = l_col_idxs[l_begin]; + auto lt_row = l_col_idxs[lt_begin]; + if (l_col == lt_row && l_col < col) { + sum += l_vals[l_begin] * l_vals[lt_begin]; + } + if (lt_row == row) { + lt_nz = lt_begin; + } + l_begin += (l_col <= lt_row); + lt_begin += (lt_row <= l_col); + } + auto new_val = a_val - sum; + if (row == col) { + new_val = sqrt(new_val); + } else { + auto diag = l_vals[l_row_ptrs[col + 1] - 1]; + new_val = new_val / diag; + } + if (is_finite(new_val)) { + l_vals[l_nz] = new_val; + } + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL); + + +template +void add_candidates(std::shared_ptr exec, + const matrix::Csr *llt, + const matrix::Csr *a, + const matrix::Csr *l, + matrix::Csr *l_new) +{ + auto num_rows = a->get_size()[0]; + auto l_row_ptrs = l->get_const_row_ptrs(); + auto l_col_idxs = l->get_const_col_idxs(); + auto l_vals = l->get_const_values(); + auto l_new_row_ptrs = l_new->get_row_ptrs(); + constexpr auto sentinel = std::numeric_limits::max(); + // count nnz + IndexType l_nnz{}; + abstract_spgeam( + a, llt, + [&](IndexType row) { + l_new_row_ptrs[row] = l_nnz; + return 0; + }, + [&](IndexType row, IndexType col, ValueType, ValueType, int) { + l_nnz += col <= row; + }, + [](IndexType, int) {}); + l_new_row_ptrs[num_rows] = l_nnz; + + // resize arrays + matrix::CsrBuilder l_builder{l_new}; + l_builder.get_col_idx_array().resize_and_reset(l_nnz); + l_builder.get_value_array().resize_and_reset(l_nnz); + auto l_new_col_idxs = l_new->get_col_idxs(); + auto l_new_vals = l_new->get_values(); + + // accumulate non-zeros + struct row_state { + IndexType l_new_nz; + IndexType 
l_old_begin; + IndexType l_old_end; + }; + abstract_spgeam( + a, llt, + [&](IndexType row) { + row_state state{}; + state.l_new_nz = l_new_row_ptrs[row]; + state.l_old_begin = l_row_ptrs[row]; + state.l_old_end = l_row_ptrs[row + 1]; + return state; + }, + [&](IndexType row, IndexType col, ValueType a_val, ValueType llt_val, + row_state &state) { + auto r_val = a_val - llt_val; + // load matching entry of L + auto l_col = checked_load(l_col_idxs, state.l_old_begin, + state.l_old_end, sentinel); + auto l_val = checked_load(l_vals, state.l_old_begin, + state.l_old_end, zero()); + // load diagonal entry of L + auto diag = l_vals[l_row_ptrs[col + 1] - 1]; + // if there is already an entry present, use that + // instead. + auto out_val = l_col == col ? l_val : r_val / diag; + // store output entries + if (row >= col) { + l_new_col_idxs[state.l_new_nz] = col; + l_new_vals[state.l_new_nz] = out_val; + state.l_new_nz++; + } + // advance entry of L if we used it + state.l_old_begin += (l_col == col); + }, + [](IndexType, row_state) {}); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL); + + +} // namespace par_ict_factorization +} // namespace reference +} // namespace kernels +} // namespace gko diff --git a/reference/factorization/par_ilu_kernels.cpp b/reference/factorization/par_ilu_kernels.cpp index d7df460e2d5..e2234d52fdf 100644 --- a/reference/factorization/par_ilu_kernels.cpp +++ b/reference/factorization/par_ilu_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/factorization/par_ilu_kernels.hpp" +#include + + #include #include #include @@ -49,88 +52,6 @@ namespace reference { namespace par_ilu_factorization { -template -void initialize_row_ptrs_l_u( - std::shared_ptr exec, - const matrix::Csr *system_matrix, - IndexType *l_row_ptrs, IndexType *u_row_ptrs) -{ - auto row_ptrs = system_matrix->get_const_row_ptrs(); - auto col_idxs = system_matrix->get_const_col_idxs(); - size_type l_nnz{}; - size_type u_nnz{}; - - l_row_ptrs[0] = 0; - u_row_ptrs[0] = 0; - for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) { - for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) { - size_type col = col_idxs[el]; - if (col <= row) { - ++l_nnz; - } - if (col >= row) { - ++u_nnz; - } - } - l_row_ptrs[row + 1] = l_nnz; - u_row_ptrs[row + 1] = u_nnz; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILU_INITIALIZE_ROW_PTRS_L_U_KERNEL); - - -template -void initialize_l_u(std::shared_ptr exec, - const matrix::Csr *system_matrix, - matrix::Csr *csr_l, - matrix::Csr *csr_u) -{ - const auto row_ptrs = system_matrix->get_const_row_ptrs(); - const auto col_idxs = system_matrix->get_const_col_idxs(); - const auto vals = system_matrix->get_const_values(); - - const auto row_ptrs_l = csr_l->get_const_row_ptrs(); - auto col_idxs_l = csr_l->get_col_idxs(); - auto vals_l = csr_l->get_values(); - - const auto row_ptrs_u = csr_u->get_const_row_ptrs(); - auto col_idxs_u = csr_u->get_col_idxs(); - auto vals_u = csr_u->get_values(); - - for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) { - size_type current_index_l = row_ptrs_l[row]; - size_type current_index_u = row_ptrs_u[row]; - for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) { - const auto col = col_idxs[el]; - const auto val = vals[el]; - if (col < row) { - col_idxs_l[current_index_l] = col; - vals_l[current_index_l] = val; - ++current_index_l; - } else if (col == row) { - // Update both L and U - 
col_idxs_l[current_index_l] = col; - vals_l[current_index_l] = one(); - ++current_index_l; - - col_idxs_u[current_index_u] = col; - vals_u[current_index_u] = val; - ++current_index_u; - } else { // col > row - col_idxs_u[current_index_u] = col; - vals_u[current_index_u] = val; - ++current_index_u; - } - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILU_INITIALIZE_L_U_KERNEL); - - template void compute_l_u_factors(std::shared_ptr exec, size_type iterations, @@ -182,12 +103,12 @@ void compute_l_u_factors(std::shared_ptr exec, if (row > col) { // modify entry in L auto to_write = sum / vals_u[row_ptrs_u[col + 1] - 1]; - if (::gko::isfinite(to_write)) { + if (is_finite(to_write)) { vals_l[row_l - 1] = to_write; } } else { // modify entry in U auto to_write = sum; - if (::gko::isfinite(to_write)) { + if (is_finite(to_write)) { vals_u[row_u - 1] = to_write; } } diff --git a/reference/factorization/par_ilut_kernels.cpp b/reference/factorization/par_ilut_kernels.cpp new file mode 100644 index 00000000000..63df1c5634e --- /dev/null +++ b/reference/factorization/par_ilut_kernels.cpp @@ -0,0 +1,473 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include +#include +#include +#include + + +#include +#include +#include +#include + + +#include "core/base/utils.hpp" +#include "core/components/prefix_sum.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "reference/components/csr_spgeam.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +/** + * @brief The parallel ilut factorization namespace. + * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +/** + * @internal + * + * Selects the `rank`th smallest element (0-based, magnitude-wise) + * from the values of `m`. It uses two temporary arrays. 
+ */ +template +void threshold_select(std::shared_ptr exec, + const matrix::Csr *m, + IndexType rank, Array &tmp, + Array> &, + remove_complex &threshold) +{ + auto values = m->get_const_values(); + IndexType size = m->get_num_stored_elements(); + tmp.resize_and_reset(size); + std::copy_n(values, size, tmp.get_data()); + + auto begin = tmp.get_data(); + auto target = begin + rank; + auto end = begin + size; + std::nth_element(begin, target, end, + [](ValueType a, ValueType b) { return abs(a) < abs(b); }); + threshold = abs(*target); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL); + + +/** + * Removes all the elements from the input matrix for which pred is false. + * Stores the result in m_out and (if non-null) m_out_coo. + * pred(row, nz) is called for each entry, where nz is the index in + * values/col_idxs. + */ +template +void abstract_filter(std::shared_ptr exec, + const matrix::Csr *m, + matrix::Csr *m_out, + matrix::Coo *m_out_coo, + Predicate pred) +{ + auto num_rows = m->get_size()[0]; + auto row_ptrs = m->get_const_row_ptrs(); + auto col_idxs = m->get_const_col_idxs(); + auto vals = m->get_const_values(); + + // first sweep: count nnz for each row + auto new_row_ptrs = m_out->get_row_ptrs(); + for (size_type row = 0; row < num_rows; ++row) { + IndexType count{}; + for (auto nz = row_ptrs[row]; nz < row_ptrs[row + 1]; ++nz) { + count += pred(row, nz); + } + new_row_ptrs[row] = count; + } + + // build row pointers + components::prefix_sum(exec, new_row_ptrs, num_rows + 1); + + // second sweep: accumulate non-zeros + auto new_nnz = new_row_ptrs[num_rows]; + // resize arrays and update aliases + matrix::CsrBuilder builder{m_out}; + builder.get_col_idx_array().resize_and_reset(new_nnz); + builder.get_value_array().resize_and_reset(new_nnz); + auto new_col_idxs = m_out->get_col_idxs(); + auto new_vals = m_out->get_values(); + IndexType *new_row_idxs{}; + if (m_out_coo) { + matrix::CooBuilder 
coo_builder{m_out_coo}; + coo_builder.get_row_idx_array().resize_and_reset(new_nnz); + coo_builder.get_col_idx_array() = + Array::view(exec, new_nnz, new_col_idxs); + coo_builder.get_value_array() = + Array::view(exec, new_nnz, new_vals); + new_row_idxs = m_out_coo->get_row_idxs(); + } + + for (size_type row = 0; row < num_rows; ++row) { + auto new_nz = new_row_ptrs[row]; + auto begin = row_ptrs[row]; + auto end = row_ptrs[row + 1]; + for (auto nz = begin; nz < end; ++nz) { + if (pred(row, nz)) { + if (new_row_idxs) { + new_row_idxs[new_nz] = row; + } + new_col_idxs[new_nz] = col_idxs[nz]; + new_vals[new_nz] = vals[nz]; + ++new_nz; + } + } + } +} + + +/** + * @internal + * + * Removes all elements below the given threshold from a matrix. + */ +template +void threshold_filter(std::shared_ptr exec, + const matrix::Csr *m, + remove_complex threshold, + matrix::Csr *m_out, + matrix::Coo *m_out_coo, bool) +{ + auto col_idxs = m->get_const_col_idxs(); + auto vals = m->get_const_values(); + abstract_filter( + exec, m, m_out, m_out_coo, [&](IndexType row, IndexType nz) { + return abs(vals[nz]) >= threshold || col_idxs[nz] == row; + }); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL); + + +constexpr auto bucket_count = 1 << sampleselect_searchtree_height; +constexpr auto sample_size = bucket_count * sampleselect_oversampling; + + +/** + * @internal + * + * Approximately selects the `rank`th smallest element as a threshold + * and removes all elements below this threshold from the input matrix. 
+ */ +template +void threshold_filter_approx(std::shared_ptr exec, + const matrix::Csr *m, + IndexType rank, Array &tmp, + remove_complex &threshold, + matrix::Csr *m_out, + matrix::Coo *m_out_coo) +{ + auto vals = m->get_const_values(); + auto col_idxs = m->get_const_col_idxs(); + auto size = static_cast(m->get_num_stored_elements()); + using AbsType = remove_complex; + constexpr auto storage_size = ceildiv( + sample_size * sizeof(AbsType) + bucket_count * sizeof(IndexType), + sizeof(ValueType)); + tmp.resize_and_reset(storage_size); + // pick and sort sample + auto sample = reinterpret_cast(tmp.get_data()); + // assuming rounding towards zero + auto stride = double(size) / sample_size; + for (IndexType i = 0; i < sample_size; ++i) { + sample[i] = abs(vals[static_cast(i * stride)]); + } + std::sort(sample, sample + sample_size); + // pick splitters + for (IndexType i = 0; i < bucket_count - 1; ++i) { + // shift by one so we get upper bounds for the buckets + sample[i] = sample[(i + 1) * sampleselect_oversampling]; + } + // count elements per bucket + auto histogram = reinterpret_cast(sample + bucket_count); + for (IndexType bucket = 0; bucket < bucket_count; ++bucket) { + histogram[bucket] = 0; + } + for (IndexType nz = 0; nz < size; ++nz) { + auto bucket_it = + std::upper_bound(sample, sample + bucket_count - 1, abs(vals[nz])); + auto bucket = std::distance(sample, bucket_it); + // smallest bucket s.t. sample[bucket] >= abs(val[nz]) + histogram[bucket]++; + } + // determine splitter ranks: prefix sum over bucket counts + components::prefix_sum(exec, histogram, bucket_count + 1); + // determine the bucket containing the threshold rank: + // prefix_sum[bucket] <= rank < prefix_sum[bucket + 1] + auto it = std::upper_bound(histogram, histogram + bucket_count + 1, rank); + auto threshold_bucket = std::distance(histogram + 1, it); + // sample contains upper bounds for the buckets + threshold = threshold_bucket > 0 ? 
sample[threshold_bucket - 1] + : zero>(); + // filter elements + abstract_filter( + exec, m, m_out, m_out_coo, [&](IndexType row, IndexType nz) { + return abs(vals[nz]) >= threshold || col_idxs[nz] == row; + }); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL); + + +/** + * @internal + * + * Computes a ParILUT sweep on the input matrices. + */ +template +void compute_l_u_factors(std::shared_ptr exec, + const matrix::Csr *a, + matrix::Csr *l, + const matrix::Coo *, + matrix::Csr *u, + const matrix::Coo *, + matrix::Csr *u_csc) +{ + auto num_rows = a->get_size()[0]; + auto l_row_ptrs = l->get_const_row_ptrs(); + auto l_col_idxs = l->get_const_col_idxs(); + auto l_vals = l->get_values(); + auto u_row_ptrs = u->get_const_row_ptrs(); + auto u_col_idxs = u->get_const_col_idxs(); + auto u_vals = u->get_values(); + auto ut_col_ptrs = u_csc->get_const_row_ptrs(); + auto ut_row_idxs = u_csc->get_const_col_idxs(); + auto ut_vals = u_csc->get_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto a_vals = a->get_const_values(); + + auto compute_sum = [&](IndexType row, IndexType col) { + // find value from A + auto a_begin = a_row_ptrs[row]; + auto a_end = a_row_ptrs[row + 1]; + auto a_nz_it = + std::lower_bound(a_col_idxs + a_begin, a_col_idxs + a_end, col); + auto a_nz = std::distance(a_col_idxs, a_nz_it); + auto has_a = a_nz < a_end && a_col_idxs[a_nz] == col; + auto a_val = has_a ? 
a_vals[a_nz] : zero(); + // accumulate l(row,:) * u(:,col) without the last entry (row, col) + ValueType sum{}; + IndexType ut_nz{}; + auto l_begin = l_row_ptrs[row]; + auto l_end = l_row_ptrs[row + 1]; + auto u_begin = ut_col_ptrs[col]; + auto u_end = ut_col_ptrs[col + 1]; + auto last_entry = min(row, col); + while (l_begin < l_end && u_begin < u_end) { + auto l_col = l_col_idxs[l_begin]; + auto u_row = ut_row_idxs[u_begin]; + if (l_col == u_row && l_col < last_entry) { + sum += l_vals[l_begin] * ut_vals[u_begin]; + } + if (u_row == row) { + ut_nz = u_begin; + } + l_begin += (l_col <= u_row); + u_begin += (u_row <= l_col); + } + return std::make_pair(a_val - sum, ut_nz); + }; + + for (size_type row = 0; row < num_rows; ++row) { + for (size_type l_nz = l_row_ptrs[row]; l_nz < l_row_ptrs[row + 1] - 1; + ++l_nz) { + auto col = l_col_idxs[l_nz]; + auto u_diag = ut_vals[ut_col_ptrs[col + 1] - 1]; + auto new_val = compute_sum(row, col).first / u_diag; + if (is_finite(new_val)) { + l_vals[l_nz] = new_val; + } + } + for (size_type u_nz = u_row_ptrs[row]; u_nz < u_row_ptrs[row + 1]; + ++u_nz) { + auto col = u_col_idxs[u_nz]; + auto result = compute_sum(row, col); + auto new_val = result.first; + auto ut_nz = result.second; + if (is_finite(new_val)) { + u_vals[u_nz] = new_val; + ut_vals[ut_nz] = new_val; + } + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL); + + +/** + * @internal + * + * Adds new entries from the sparsity pattern of A - L * U + * to L and U, where new values are chosen based on the residual + * value divided by the corresponding diagonal entry. 
+ */ +template +void add_candidates(std::shared_ptr exec, + const matrix::Csr *lu, + const matrix::Csr *a, + const matrix::Csr *l, + const matrix::Csr *u, + matrix::Csr *l_new, + matrix::Csr *u_new) +{ + auto num_rows = a->get_size()[0]; + auto l_row_ptrs = l->get_const_row_ptrs(); + auto l_col_idxs = l->get_const_col_idxs(); + auto l_vals = l->get_const_values(); + auto u_row_ptrs = u->get_const_row_ptrs(); + auto u_col_idxs = u->get_const_col_idxs(); + auto u_vals = u->get_const_values(); + auto l_new_row_ptrs = l_new->get_row_ptrs(); + auto u_new_row_ptrs = u_new->get_row_ptrs(); + constexpr auto sentinel = std::numeric_limits::max(); + // count nnz + IndexType l_nnz{}; + IndexType u_nnz{}; + abstract_spgeam( + a, lu, + [&](IndexType row) { + l_new_row_ptrs[row] = l_nnz; + u_new_row_ptrs[row] = u_nnz; + return 0; + }, + [&](IndexType row, IndexType col, ValueType, ValueType, int) { + l_nnz += col <= row; + u_nnz += col >= row; + }, + [](IndexType, int) {}); + l_new_row_ptrs[num_rows] = l_nnz; + u_new_row_ptrs[num_rows] = u_nnz; + + // resize arrays + matrix::CsrBuilder l_builder{l_new}; + matrix::CsrBuilder u_builder{u_new}; + l_builder.get_col_idx_array().resize_and_reset(l_nnz); + l_builder.get_value_array().resize_and_reset(l_nnz); + u_builder.get_col_idx_array().resize_and_reset(u_nnz); + u_builder.get_value_array().resize_and_reset(u_nnz); + auto l_new_col_idxs = l_new->get_col_idxs(); + auto l_new_vals = l_new->get_values(); + auto u_new_col_idxs = u_new->get_col_idxs(); + auto u_new_vals = u_new->get_values(); + + // accumulate non-zeros + struct row_state { + IndexType l_new_nz; + IndexType u_new_nz; + IndexType l_old_begin; + IndexType l_old_end; + IndexType u_old_begin; + IndexType u_old_end; + bool finished_l; + }; + abstract_spgeam( + a, lu, + [&](IndexType row) { + row_state state{}; + state.l_new_nz = l_new_row_ptrs[row]; + state.u_new_nz = u_new_row_ptrs[row]; + state.l_old_begin = l_row_ptrs[row]; + state.l_old_end = l_row_ptrs[row + 1] - 1; // 
skip diagonal + state.u_old_begin = u_row_ptrs[row]; + state.u_old_end = u_row_ptrs[row + 1]; + state.finished_l = (state.l_old_begin == state.l_old_end); + return state; + }, + [&](IndexType row, IndexType col, ValueType a_val, ValueType lu_val, + row_state &state) { + auto r_val = a_val - lu_val; + // load matching entry of L + U + auto lpu_col = state.finished_l + ? checked_load(u_col_idxs, state.u_old_begin, + state.u_old_end, sentinel) + : l_col_idxs[state.l_old_begin]; + auto lpu_val = + state.finished_l + ? checked_load(u_vals, state.u_old_begin, state.u_old_end, + zero()) + : l_vals[state.l_old_begin]; + // load diagonal entry of U for lower diagonal entries + auto diag = col < row ? u_vals[u_row_ptrs[col]] : one(); + // if there is already an entry present, use that instead. + auto out_val = lpu_col == col ? lpu_val : r_val / diag; + // store output entries + if (row >= col) { + l_new_col_idxs[state.l_new_nz] = col; + l_new_vals[state.l_new_nz] = + row == col ? one() : out_val; + state.l_new_nz++; + } + if (row <= col) { + u_new_col_idxs[state.u_new_nz] = col; + u_new_vals[state.u_new_nz] = out_val; + state.u_new_nz++; + } + // advance entry of L + U if we used it + if (state.finished_l) { + state.u_old_begin += (lpu_col == col); + } else { + state.l_old_begin += (lpu_col == col); + state.finished_l = (state.l_old_begin == state.l_old_end); + } + }, + [](IndexType, row_state) {}); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace reference +} // namespace kernels +} // namespace gko diff --git a/reference/matrix/coo_kernels.cpp b/reference/matrix/coo_kernels.cpp index 76f15e45766..74a0355be68 100644 --- a/reference/matrix/coo_kernels.cpp +++ b/reference/matrix/coo_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All 
rights reserved. Redistribution and use in source and binary forms, with or without @@ -145,8 +145,8 @@ void convert_row_idxs_to_ptrs(std::shared_ptr exec, template void convert_to_csr(std::shared_ptr exec, - matrix::Csr *result, - const matrix::Coo *source) + const matrix::Coo *source, + matrix::Csr *result) { auto num_rows = result->get_size()[0]; @@ -165,8 +165,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_dense(std::shared_ptr exec, - matrix::Dense *result, - const matrix::Coo *source) + const matrix::Coo *source, + matrix::Dense *result) { auto coo_val = source->get_const_values(); auto coo_col = source->get_const_col_idxs(); diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index 2d2adcae1d5..18d39412c95 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,11 +34,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include +#include #include #include +#include #include #include #include @@ -48,7 +49,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/base/allocator.hpp" #include "core/base/iterator_factory.hpp" +#include "core/components/prefix_sum.hpp" +#include "core/matrix/csr_builder.hpp" +#include "reference/components/csr_spgeam.hpp" #include "reference/components/format_conversion.hpp" @@ -123,6 +128,234 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL); +template +void spgemm_insert_row(unordered_set &cols, + const matrix::Csr *c, + size_type row) +{ + auto row_ptrs = c->get_const_row_ptrs(); + auto col_idxs = c->get_const_col_idxs(); + cols.insert(col_idxs + row_ptrs[row], col_idxs + row_ptrs[row + 1]); +} + + +template +void spgemm_insert_row2(unordered_set &cols, + const matrix::Csr *a, + const matrix::Csr *b, + size_type row) +{ + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto b_row_ptrs = b->get_const_row_ptrs(); + auto b_col_idxs = b->get_const_col_idxs(); + for (size_type a_nz = a_row_ptrs[row]; + a_nz < size_type(a_row_ptrs[row + 1]); ++a_nz) { + auto a_col = a_col_idxs[a_nz]; + auto b_row = a_col; + cols.insert(b_col_idxs + b_row_ptrs[b_row], + b_col_idxs + b_row_ptrs[b_row + 1]); + } +} + + +template +void spgemm_accumulate_row(map &cols, + const matrix::Csr *c, + ValueType scale, size_type row) +{ + auto row_ptrs = c->get_const_row_ptrs(); + auto col_idxs = c->get_const_col_idxs(); + auto vals = c->get_const_values(); + for (size_type c_nz = row_ptrs[row]; c_nz < size_type(row_ptrs[row + 1]); + ++c_nz) { + auto c_col = col_idxs[c_nz]; + auto c_val = vals[c_nz]; + cols[c_col] += scale * c_val; + } +} + + +template +void spgemm_accumulate_row2(map &cols, + const matrix::Csr *a, + const matrix::Csr *b, + ValueType scale, size_type row) +{ + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto a_vals = a->get_const_values(); + auto b_row_ptrs = b->get_const_row_ptrs(); + auto b_col_idxs = b->get_const_col_idxs(); + auto b_vals = 
b->get_const_values(); + for (size_type a_nz = a_row_ptrs[row]; + a_nz < size_type(a_row_ptrs[row + 1]); ++a_nz) { + auto a_col = a_col_idxs[a_nz]; + auto a_val = a_vals[a_nz]; + auto b_row = a_col; + for (size_type b_nz = b_row_ptrs[b_row]; + b_nz < size_type(b_row_ptrs[b_row + 1]); ++b_nz) { + auto b_col = b_col_idxs[b_nz]; + auto b_val = b_vals[b_nz]; + cols[b_col] += scale * a_val * b_val; + } + } +} + + +template +void spgemm(std::shared_ptr exec, + const matrix::Csr *a, + const matrix::Csr *b, + matrix::Csr *c) +{ + auto num_rows = a->get_size()[0]; + + // first sweep: count nnz for each row + auto c_row_ptrs = c->get_row_ptrs(); + + unordered_set local_col_idxs(exec); + for (size_type a_row = 0; a_row < num_rows; ++a_row) { + local_col_idxs.clear(); + spgemm_insert_row2(local_col_idxs, a, b, a_row); + c_row_ptrs[a_row] = local_col_idxs.size(); + } + + // build row pointers + components::prefix_sum(exec, c_row_ptrs, num_rows + 1); + + // second sweep: accumulate non-zeros + auto new_nnz = c_row_ptrs[num_rows]; + matrix::CsrBuilder c_builder{c}; + auto &c_col_idxs_array = c_builder.get_col_idx_array(); + auto &c_vals_array = c_builder.get_value_array(); + c_col_idxs_array.resize_and_reset(new_nnz); + c_vals_array.resize_and_reset(new_nnz); + auto c_col_idxs = c_col_idxs_array.get_data(); + auto c_vals = c_vals_array.get_data(); + + map local_row_nzs(exec); + for (size_type a_row = 0; a_row < num_rows; ++a_row) { + local_row_nzs.clear(); + spgemm_accumulate_row2(local_row_nzs, a, b, one(), a_row); + // store result + auto c_nz = c_row_ptrs[a_row]; + for (auto pair : local_row_nzs) { + c_col_idxs[c_nz] = pair.first; + c_vals[c_nz] = pair.second; + ++c_nz; + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL); + + +template +void advanced_spgemm(std::shared_ptr exec, + const matrix::Dense *alpha, + const matrix::Csr *a, + const matrix::Csr *b, + const matrix::Dense *beta, + const matrix::Csr *d, + matrix::Csr *c) +{ + auto 
num_rows = a->get_size()[0]; + auto valpha = alpha->at(0, 0); + auto vbeta = beta->at(0, 0); + + // first sweep: count nnz for each row + auto c_row_ptrs = c->get_row_ptrs(); + + unordered_set local_col_idxs(exec); + for (size_type a_row = 0; a_row < num_rows; ++a_row) { + local_col_idxs.clear(); + spgemm_insert_row(local_col_idxs, d, a_row); + spgemm_insert_row2(local_col_idxs, a, b, a_row); + c_row_ptrs[a_row] = local_col_idxs.size(); + } + + // build row pointers + components::prefix_sum(exec, c_row_ptrs, num_rows + 1); + + // second sweep: accumulate non-zeros + auto new_nnz = c_row_ptrs[num_rows]; + matrix::CsrBuilder c_builder{c}; + auto &c_col_idxs_array = c_builder.get_col_idx_array(); + auto &c_vals_array = c_builder.get_value_array(); + c_col_idxs_array.resize_and_reset(new_nnz); + c_vals_array.resize_and_reset(new_nnz); + auto c_col_idxs = c_col_idxs_array.get_data(); + auto c_vals = c_vals_array.get_data(); + + map local_row_nzs(exec); + for (size_type a_row = 0; a_row < num_rows; ++a_row) { + local_row_nzs.clear(); + spgemm_accumulate_row(local_row_nzs, d, vbeta, a_row); + spgemm_accumulate_row2(local_row_nzs, a, b, valpha, a_row); + // store result + auto c_nz = c_row_ptrs[a_row]; + for (auto pair : local_row_nzs) { + c_col_idxs[c_nz] = pair.first; + c_vals[c_nz] = pair.second; + ++c_nz; + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL); + + +template +void spgeam(std::shared_ptr exec, + const matrix::Dense *alpha, + const matrix::Csr *a, + const matrix::Dense *beta, + const matrix::Csr *b, + matrix::Csr *c) +{ + auto num_rows = a->get_size()[0]; + auto valpha = alpha->at(0, 0); + auto vbeta = beta->at(0, 0); + + // first sweep: count nnz for each row + auto c_row_ptrs = c->get_row_ptrs(); + + abstract_spgeam( + a, b, [](IndexType) { return IndexType{}; }, + [](IndexType, IndexType, ValueType, ValueType, IndexType &nnz) { + ++nnz; + }, + [&](IndexType row, IndexType nnz) { c_row_ptrs[row] = nnz; }); 
+ + // build row pointers + components::prefix_sum(exec, c_row_ptrs, num_rows + 1); + + // second sweep: accumulate non-zeros + auto new_nnz = c_row_ptrs[num_rows]; + matrix::CsrBuilder c_builder{c}; + auto &c_col_idxs_array = c_builder.get_col_idx_array(); + auto &c_vals_array = c_builder.get_value_array(); + c_col_idxs_array.resize_and_reset(new_nnz); + c_vals_array.resize_and_reset(new_nnz); + auto c_col_idxs = c_col_idxs_array.get_data(); + auto c_vals = c_vals_array.get_data(); + + abstract_spgeam( + a, b, [&](IndexType row) { return c_row_ptrs[row]; }, + [&](IndexType, IndexType col, ValueType a_val, ValueType b_val, + IndexType &nz) { + c_vals[nz] = valpha * a_val + vbeta * b_val; + c_col_idxs[nz] = col; + ++nz; + }, + [](IndexType, IndexType) {}); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); + + template void convert_row_ptrs_to_idxs(std::shared_ptr exec, const IndexType *ptrs, size_type num_rows, @@ -134,8 +367,8 @@ void convert_row_ptrs_to_idxs(std::shared_ptr exec, template void convert_to_coo(std::shared_ptr exec, - matrix::Coo *result, - const matrix::Csr *source) + const matrix::Csr *source, + matrix::Coo *result) { auto num_rows = result->get_size()[0]; @@ -150,8 +383,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_dense(std::shared_ptr exec, - matrix::Dense *result, - const matrix::Csr *source) + const matrix::Csr *source, + matrix::Dense *result) { auto num_rows = source->get_size()[0]; auto num_cols = source->get_size()[1]; @@ -176,8 +409,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_sellp(std::shared_ptr exec, - matrix::Sellp *result, - const matrix::Csr *source) + const matrix::Csr *source, + matrix::Sellp *result) { auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; @@ -239,8 +472,10 @@ void convert_to_sellp(std::shared_ptr exec, } } } - slice_sets[slice_num] = - slice_sets[slice_num - 1] + slice_lengths[slice_num - 1]; 
+ if (slice_num > 0) { + slice_sets[slice_num] = + slice_sets[slice_num - 1] + slice_lengths[slice_num - 1]; + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -281,8 +516,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_ell(std::shared_ptr exec, - matrix::Ell *result, - const matrix::Csr *source) + const matrix::Csr *source, + matrix::Ell *result) { const auto num_rows = source->get_size()[0]; const auto num_cols = source->get_size()[1]; @@ -310,7 +545,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL); -template +template inline void convert_csr_to_csc(size_type num_rows, const IndexType *row_ptrs, const IndexType *col_idxs, const ValueType *csr_vals, IndexType *row_idxs, @@ -355,8 +590,8 @@ void transpose_and_transform(std::shared_ptr exec, template void transpose(std::shared_ptr exec, - matrix::Csr *trans, - const matrix::Csr *orig) + const matrix::Csr *orig, + matrix::Csr *trans) { transpose_and_transform(exec, trans, orig, [](const ValueType x) { return x; }); @@ -367,8 +602,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL); template void conj_transpose(std::shared_ptr exec, - matrix::Csr *trans, - const matrix::Csr *orig) + const matrix::Csr *orig, + matrix::Csr *trans) { transpose_and_transform(exec, trans, orig, [](const ValueType x) { return conj(x); }); @@ -400,8 +635,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_hybrid(std::shared_ptr exec, - matrix::Hybrid *result, - const matrix::Csr *source) + const matrix::Csr *source, + matrix::Hybrid *result) { auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; @@ -454,6 +689,139 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL); +template +void row_permute_impl(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Csr *orig, + matrix::Csr *row_permuted) +{ + auto perm = 
permutation_indices->get_const_data(); + auto orig_row_ptrs = orig->get_const_row_ptrs(); + auto orig_col_idxs = orig->get_const_col_idxs(); + auto orig_vals = orig->get_const_values(); + auto rp_row_ptrs = row_permuted->get_row_ptrs(); + auto rp_col_idxs = row_permuted->get_col_idxs(); + auto rp_vals = row_permuted->get_values(); + size_type num_rows = orig->get_size()[0]; + size_type num_nnz = orig->get_num_stored_elements(); + + size_type cur_ptr = 0; + rp_row_ptrs[0] = cur_ptr; + vector orig_num_nnz_per_row(num_rows, 0, exec); + for (size_type row = 0; row < num_rows; ++row) { + orig_num_nnz_per_row[row] = orig_row_ptrs[row + 1] - orig_row_ptrs[row]; + } + for (size_type row = 0; row < num_rows; ++row) { + rp_row_ptrs[row + 1] = + rp_row_ptrs[row] + orig_num_nnz_per_row[perm[row]]; + } + rp_row_ptrs[num_rows] = orig_row_ptrs[num_rows]; + for (size_type row = 0; row < num_rows; ++row) { + auto new_row = perm[row]; + auto new_k = orig_row_ptrs[new_row]; + for (size_type k = rp_row_ptrs[row]; + k < size_type(rp_row_ptrs[row + 1]); ++k) { + rp_col_idxs[k] = orig_col_idxs[new_k]; + rp_vals[k] = orig_vals[new_k]; + new_k++; + } + } +} + + +template +void row_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Csr *orig, + matrix::Csr *row_permuted) +{ + row_permute_impl(exec, permutation_indices, orig, row_permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL); + + +template +void inverse_row_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Csr *orig, + matrix::Csr *row_permuted) +{ + auto perm = permutation_indices->get_const_data(); + Array inv_perm(*permutation_indices); + auto iperm = inv_perm.get_data(); + for (size_type ind = 0; ind < inv_perm.get_num_elems(); ++ind) { + iperm[perm[ind]] = ind; + } + + row_permute_impl(exec, &inv_perm, orig, row_permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + 
GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL); + + +template +void column_permute_impl(const Array *permutation_indices, + const matrix::Csr *orig, + matrix::Csr *column_permuted) +{ + auto perm = permutation_indices->get_const_data(); + auto orig_row_ptrs = orig->get_const_row_ptrs(); + auto orig_col_idxs = orig->get_const_col_idxs(); + auto orig_vals = orig->get_const_values(); + auto cp_row_ptrs = column_permuted->get_row_ptrs(); + auto cp_col_idxs = column_permuted->get_col_idxs(); + auto cp_vals = column_permuted->get_values(); + auto num_nnz = orig->get_num_stored_elements(); + size_type num_rows = orig->get_size()[0]; + size_type num_cols = orig->get_size()[1]; + + for (size_type row = 0; row < num_rows; ++row) { + cp_row_ptrs[row] = orig_row_ptrs[row]; + for (size_type k = orig_row_ptrs[row]; + k < size_type(orig_row_ptrs[row + 1]); ++k) { + cp_col_idxs[k] = perm[orig_col_idxs[k]]; + cp_vals[k] = orig_vals[k]; + } + } + cp_row_ptrs[num_rows] = orig_row_ptrs[num_rows]; +} + + +template +void column_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Csr *orig, + matrix::Csr *column_permuted) +{ + auto perm = permutation_indices->get_const_data(); + Array inv_perm(*permutation_indices); + auto iperm = inv_perm.get_data(); + for (size_type ind = 0; ind < inv_perm.get_num_elems(); ++ind) { + iperm[perm[ind]] = ind; + } + column_permute_impl(&inv_perm, orig, column_permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL); + + +template +void inverse_column_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Csr *orig, + matrix::Csr *column_permuted) +{ + column_permute_impl(permutation_indices, orig, column_permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL); + + template void calculate_nonzeros_per_row(std::shared_ptr exec, const matrix::Csr *source, diff --git a/reference/matrix/dense_kernels.cpp 
b/reference/matrix/dense_kernels.cpp index 82a20a8b1a4..a55a8b1d24f 100644 --- a/reference/matrix/dense_kernels.cpp +++ b/reference/matrix/dense_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,6 +33,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/dense_kernels.hpp" +#include + + +#include #include #include #include @@ -43,9 +47,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include - - namespace gko { namespace kernels { namespace reference { @@ -181,14 +182,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); template void compute_norm2(std::shared_ptr exec, const matrix::Dense *x, - matrix::Dense *result) + matrix::Dense> *result) { - compute_dot(exec, x, x, result); - for (size_type i = 0; i < result->get_size()[0]; ++i) { - for (size_type j = 0; j < result->get_size()[1]; ++j) { - result->at(i, j) = sqrt(abs(result->at(i, j))); + for (size_type j = 0; j < x->get_size()[1]; ++j) { + result->at(0, j) = zero>(); + } + for (size_type i = 0; i < x->get_size()[0]; ++i) { + for (size_type j = 0; j < x->get_size()[1]; ++j) { + result->at(0, j) += squared_norm(x->at(i, j)); } } + for (size_type j = 0; j < x->get_size()[1]; ++j) { + result->at(0, j) = sqrt(result->at(0, j)); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); @@ -196,8 +202,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); template void convert_to_coo(std::shared_ptr exec, - matrix::Coo *result, - const matrix::Dense *source) + const matrix::Dense *source, + matrix::Coo *result) { auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; @@ -227,8 +233,8 @@ 
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_csr(std::shared_ptr exec, - matrix::Csr *result, - const matrix::Dense *source) + const matrix::Dense *source, + matrix::Csr *result) { auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; @@ -259,8 +265,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_ell(std::shared_ptr exec, - matrix::Ell *result, - const matrix::Dense *source) + const matrix::Dense *source, + matrix::Ell *result) { auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; @@ -291,8 +297,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_hybrid(std::shared_ptr exec, - matrix::Hybrid *result, - const matrix::Dense *source) + const matrix::Dense *source, + matrix::Hybrid *result) { auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; @@ -346,8 +352,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_sellp(std::shared_ptr exec, - matrix::Sellp *result, - const matrix::Dense *source) + const matrix::Dense *source, + matrix::Sellp *result) { auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; @@ -407,8 +413,11 @@ void convert_to_sellp(std::shared_ptr exec, } } } - slice_sets[slice_num] = - slice_sets[slice_num - 1] + slice_lengths[slice_num - 1]; + + if (slice_num > 0) { + slice_sets[slice_num] = + slice_sets[slice_num - 1] + slice_lengths[slice_num - 1]; + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -417,8 +426,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_sparsity_csr(std::shared_ptr exec, - matrix::SparsityCsr *result, - const matrix::Dense *source) + const matrix::Dense *source, + matrix::SparsityCsr *result) { auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; @@ -546,8 +555,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void transpose(std::shared_ptr exec, - matrix::Dense *trans, - 
const matrix::Dense *orig) + const matrix::Dense *orig, + matrix::Dense *trans) { for (size_type i = 0; i < orig->get_size()[0]; ++i) { for (size_type j = 0; j < orig->get_size()[1]; ++j) { @@ -561,8 +570,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TRANSPOSE_KERNEL); template void conj_transpose(std::shared_ptr exec, - matrix::Dense *trans, - const matrix::Dense *orig) + const matrix::Dense *orig, + matrix::Dense *trans) { for (size_type i = 0; i < orig->get_size()[0]; ++i) { for (size_type j = 0; j < orig->get_size()[1]; ++j) { @@ -574,6 +583,77 @@ void conj_transpose(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CONJ_TRANSPOSE_KERNEL); +template +void row_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Dense *orig, + matrix::Dense *row_permuted) +{ + auto perm = permutation_indices->get_const_data(); + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + row_permuted->at(i, j) = orig->at(perm[i], j); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ROW_PERMUTE_KERNEL); + + +template +void column_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Dense *orig, + matrix::Dense *column_permuted) +{ + auto perm = permutation_indices->get_const_data(); + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + column_permuted->at(i, j) = orig->at(i, perm[j]); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_COLUMN_PERMUTE_KERNEL); + + +template +void inverse_row_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Dense *orig, + matrix::Dense *row_permuted) +{ + auto perm = permutation_indices->get_const_data(); + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + row_permuted->at(perm[i], j) = orig->at(i, j); + } + } +} + 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_INVERSE_ROW_PERMUTE_KERNEL); + + +template +void inverse_column_permute(std::shared_ptr exec, + const Array *permutation_indices, + const matrix::Dense *orig, + matrix::Dense *column_permuted) +{ + auto perm = permutation_indices->get_const_data(); + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + column_permuted->at(i, perm[j]) = orig->at(i, j); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_INVERSE_COLUMN_PERMUTE_KERNEL); + + } // namespace dense } // namespace reference } // namespace kernels diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp index 318969ea258..0f21a6c2f3a 100644 --- a/reference/matrix/ell_kernels.cpp +++ b/reference/matrix/ell_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -106,8 +106,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_dense(std::shared_ptr exec, - matrix::Dense *result, - const matrix::Ell *source) + const matrix::Ell *source, + matrix::Dense *result) { auto num_rows = source->get_size()[0]; auto num_cols = source->get_size()[1]; @@ -130,8 +130,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_csr(std::shared_ptr exec, - matrix::Csr *result, - const matrix::Ell *source) + const matrix::Ell *source, + matrix::Csr *result) { const auto num_rows = source->get_size()[0]; const auto max_nnz_per_row = source->get_num_stored_elements_per_row(); diff --git a/reference/matrix/hybrid_kernels.cpp b/reference/matrix/hybrid_kernels.cpp index 7e0acc4bc50..74e126334e2 100644 --- a/reference/matrix/hybrid_kernels.cpp +++ b/reference/matrix/hybrid_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -58,8 +58,8 @@ namespace hybrid { template void convert_to_dense(std::shared_ptr exec, - matrix::Dense *result, - const matrix::Hybrid *source) + const matrix::Hybrid *source, + matrix::Dense *result) { auto num_rows = source->get_size()[0]; auto num_cols = source->get_size()[1]; @@ -93,8 +93,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_csr(std::shared_ptr exec, - matrix::Csr *result, - const matrix::Hybrid *source) + const matrix::Hybrid *source, + matrix::Csr *result) { auto csr_val = result->get_values(); auto csr_col_idxs = result->get_col_idxs(); diff --git a/reference/matrix/sellp_kernels.cpp b/reference/matrix/sellp_kernels.cpp index 85d2a705982..43e01b51fb1 100644 --- a/reference/matrix/sellp_kernels.cpp +++ b/reference/matrix/sellp_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -125,8 +125,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_dense(std::shared_ptr exec, - matrix::Dense *result, - const matrix::Sellp *source) + const matrix::Sellp *source, + matrix::Dense *result) { auto num_rows = source->get_size()[0]; auto num_cols = source->get_size()[1]; @@ -161,8 +161,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_csr(std::shared_ptr exec, - matrix::Csr *result, - const matrix::Sellp *source) + const matrix::Sellp *source, + matrix::Csr *result) { auto num_rows = source->get_size()[0]; auto slice_size = source->get_slice_size(); diff --git a/reference/matrix/sparsity_csr_kernels.cpp b/reference/matrix/sparsity_csr_kernels.cpp index 42b4edd88a2..70ab3b15aff 100644 --- a/reference/matrix/sparsity_csr_kernels.cpp +++ b/reference/matrix/sparsity_csr_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -143,9 +143,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void remove_diagonal_elements(std::shared_ptr exec, - matrix::SparsityCsr *matrix, const IndexType *row_ptrs, - const IndexType *col_idxs) + const IndexType *col_idxs, + matrix::SparsityCsr *matrix) { auto num_rows = matrix->get_size()[0]; auto adj_ptrs = matrix->get_row_ptrs(); @@ -193,8 +193,8 @@ inline void convert_sparsity_to_csc(size_type num_rows, template void transpose_and_transform( std::shared_ptr exec, - matrix::SparsityCsr *trans, - const matrix::SparsityCsr *orig) + const matrix::SparsityCsr *orig, + matrix::SparsityCsr *trans) { auto trans_row_ptrs = trans->get_row_ptrs(); auto orig_row_ptrs = orig->get_const_row_ptrs(); @@ -216,10 +216,10 @@ void transpose_and_transform( template void transpose(std::shared_ptr exec, - matrix::SparsityCsr *trans, - const matrix::SparsityCsr *orig) + const matrix::SparsityCsr *orig, + matrix::SparsityCsr *trans) { - transpose_and_transform(exec, trans, orig); + transpose_and_transform(exec, orig, trans); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/reference/preconditioner/isai_kernels.cpp b/reference/preconditioner/isai_kernels.cpp new file mode 100644 index 00000000000..6a3a682e395 --- /dev/null +++ b/reference/preconditioner/isai_kernels.cpp @@ -0,0 +1,316 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/preconditioner/isai_kernels.hpp" + + +#include +#include + + +#include +#include +#include +#include + + +#include "core/matrix/csr_builder.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +/** + * @brief The Isai preconditioner namespace. 
+ * + * @ingroup isai + */ +namespace isai { + + +template +void forall_matching(const IndexType *fst, IndexType fst_size, + const IndexType *snd, IndexType snd_size, Callback cb) +{ + IndexType fst_idx{}; + IndexType snd_idx{}; + while (fst_idx < fst_size && snd_idx < snd_size) { + const auto fst_val = fst[fst_idx]; + const auto snd_val = snd[snd_idx]; + if (fst_val == snd_val) { + cb(fst_val, fst_idx, snd_idx); + } + // advance the smaller entrie(s) + fst_idx += (fst_val <= snd_val); + snd_idx += (fst_val >= snd_val); + } +} + + +template +void generic_generate(std::shared_ptr exec, + const matrix::Csr *mtx, + matrix::Csr *inverse_mtx, + IndexType *excess_rhs_ptrs, IndexType *excess_nz_ptrs, + Callable trs_solve) +{ + /* + Consider: aiM := inverse_mtx; M := mtx + I := Identity matrix + e(i) := unit vector i (containing all zeros except for row i, which is one) + S := Sparsity pattern of the desired aiM + S(i) := Sparsity pattern of row i of aiM (Set of non-zero columns) + D(i) := M[S(i), S(i)] + aiM := approximate inverse of M + + Target: Solving (aiM * M = I)_{S} (aiM * M = I for the sparsity pattern S) + aiM[i, :] * D(i) = e(i)^T + <=> D(i)^T * aiM[i, :]^T = e(i) =^ Triangular system (Trs) + Solve Trs, fill in aiM row by row (coalesced access) + */ + const auto num_rows = mtx->get_size()[0]; + const auto m_row_ptrs = mtx->get_const_row_ptrs(); + const auto m_cols = mtx->get_const_col_idxs(); + const auto m_vals = mtx->get_const_values(); + const auto i_row_ptrs = inverse_mtx->get_const_row_ptrs(); + const auto i_cols = inverse_mtx->get_const_col_idxs(); + auto i_vals = inverse_mtx->get_values(); + // RHS for local trisystem + gko::Array rhs_array{exec, row_size_limit}; + auto rhs = rhs_array.get_data(); + // memory for dense trisystem + gko::Array trisystem_array{exec, + row_size_limit * row_size_limit}; + auto trisystem_ptr = trisystem_array.get_data(); + // stores the next free index in the excess rhs/solution + IndexType excess_rhs_begin{}; + // stores the 
next free non-zero index in the excess system + IndexType excess_nz_begin{}; + + for (size_type row = 0; row < num_rows; ++row) { + const auto i_begin = i_row_ptrs[row]; + const auto i_size = i_row_ptrs[row + 1] - i_begin; + excess_rhs_ptrs[row] = excess_rhs_begin; + excess_nz_ptrs[row] = excess_nz_begin; + + if (i_size <= row_size_limit) { + // short rows: treat directly as dense system + // we need this ugly workaround to get rid of a few + // warnings and compilation issues + auto trisystem = range>( + trisystem_ptr, static_cast(i_size), + static_cast(i_size), static_cast(i_size)); + std::fill_n(trisystem_ptr, i_size * i_size, zero()); + + for (size_type i = 0; i < i_size; ++i) { + const auto col = i_cols[i_begin + i]; + const auto m_begin = m_row_ptrs[col]; + const auto m_size = m_row_ptrs[col + 1] - m_begin; + forall_matching( + m_cols + m_begin, m_size, i_cols + i_begin, i_size, + [&](IndexType, IndexType m_idx, IndexType i_idx) { + trisystem(i, i_idx) = m_vals[m_idx + m_begin]; + }); + } + + // solve dense triangular system + trs_solve(trisystem, rhs); + + // write triangular solution to inverse + for (size_type i = 0; i < i_size; ++i) { + const auto new_val = rhs[i]; + const auto idx = i_begin + i; + // check for non-finite elements which should not be copied over + if (is_finite(new_val)) { + i_vals[idx] = new_val; + } else { + // ensure the preconditioner does not prevent convergence + i_vals[idx] = i_cols[idx] == row ? 
one() + : zero(); + } + } + } else { + // count non-zeros and dimension in the excess system + for (size_type i = 0; i < i_size; ++i) { + const auto col = i_cols[i_begin + i]; + const auto m_begin = m_row_ptrs[col]; + const auto m_size = m_row_ptrs[col + 1] - m_begin; + forall_matching(m_cols + m_begin, m_size, i_cols + i_begin, + i_size, [&](IndexType, IndexType, IndexType) { + ++excess_nz_begin; + }); + ++excess_rhs_begin; + } + } + } + excess_rhs_ptrs[num_rows] = excess_rhs_begin; + excess_nz_ptrs[num_rows] = excess_nz_begin; +} + + +template +void generate_tri_inverse(std::shared_ptr exec, + const matrix::Csr *mtx, + matrix::Csr *inverse_mtx, + IndexType *excess_rhs_ptrs, IndexType *excess_nz_ptrs, + bool lower) +{ + auto trs_solve = + [lower](const range> trisystem, + ValueType *rhs) { + const IndexType size = trisystem.length(0); + if (size <= 0) { + return; + } + // RHS is the identity: zero everywhere except for the diagonal + // entry + std::fill_n(rhs, size, zero()); + rhs[lower ? 
size - 1 : 0] = one(); + + // solve transposed triangular system + if (lower) { + for (auto col = size - 1; col >= 0; --col) { + const auto diag = trisystem(col, col); + const auto bot = rhs[col] / diag; + rhs[col] = bot; + // do a backwards substitution + for (auto row = col - 1; row >= 0; --row) { + rhs[row] -= bot * trisystem(col, row); + } + } + } else { + for (IndexType col = 0; col < size; ++col) { + const auto diag = trisystem(col, col); + const auto top = rhs[col] / diag; + rhs[col] = top; + // do a forward substitution + for (auto row = col + 1; row < size; ++row) { + rhs[row] -= top * trisystem(col, row); + } + } + } + }; + + generic_generate(exec, mtx, inverse_mtx, excess_rhs_ptrs, excess_nz_ptrs, + trs_solve); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL); + + +template +void generate_excess_system(std::shared_ptr, + const matrix::Csr *input, + const matrix::Csr *inverse, + const IndexType *, const IndexType *, + matrix::Csr *excess_system, + matrix::Dense *excess_rhs) +{ + const auto num_rows = input->get_size()[0]; + const auto m_row_ptrs = input->get_const_row_ptrs(); + const auto m_cols = input->get_const_col_idxs(); + const auto m_vals = input->get_const_values(); + const auto i_row_ptrs = inverse->get_const_row_ptrs(); + const auto i_cols = inverse->get_const_col_idxs(); + const auto e_dim = excess_rhs->get_size()[0]; + auto e_row_ptrs = excess_system->get_row_ptrs(); + auto e_cols = excess_system->get_col_idxs(); + auto e_vals = excess_system->get_values(); + auto e_rhs = excess_rhs->get_values(); + IndexType e_block_begin{}; + IndexType e_nz{}; + + for (size_type row = 0; row < num_rows; ++row) { + const auto i_begin = i_row_ptrs[row]; + const auto i_size = i_row_ptrs[row + 1] - i_begin; + + if (i_size > row_size_limit) { + // count non-zeros and dimension in the excess system + for (size_type i = 0; i < i_size; ++i) { + // current row in the excess system + const auto e_row = e_block_begin 
+ i; + const auto col = i_cols[i_begin + i]; + const auto m_begin = m_row_ptrs[col]; + const auto m_size = m_row_ptrs[col + 1] - m_begin; + // store row pointers: one row per non-zero of inverse row + e_row_ptrs[e_row] = e_nz; + // build right-hand side: identity row + e_rhs[e_row] = + row == col ? one() : zero(); + // build sparse block + forall_matching( + m_cols + m_begin, m_size, i_cols + i_begin, i_size, + [&](IndexType, IndexType m_idx, IndexType i_idx) { + // trisystem(i, i_idx) = m_vals[m_idx + m_begin] + // just in sparse + e_cols[e_nz] = i_idx + e_block_begin; + e_vals[e_nz] = m_vals[m_idx + m_begin]; + ++e_nz; + }); + } + e_block_begin += i_size; + } + } + e_row_ptrs[e_dim] = e_nz; +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL); + + +template +void scatter_excess_solution(std::shared_ptr, + const IndexType *excess_block_ptrs, + const matrix::Dense *excess_solution, + matrix::Csr *inverse) +{ + const auto num_rows = inverse->get_size()[0]; + auto excess_values = excess_solution->get_const_values(); + auto values = inverse->get_values(); + auto row_ptrs = inverse->get_const_row_ptrs(); + for (size_type row = 0; row < num_rows; ++row) { + const auto excess_begin = excess_values + excess_block_ptrs[row]; + const auto excess_end = excess_values + excess_block_ptrs[row + 1]; + auto values_begin = values + row_ptrs[row]; + std::copy(excess_begin, excess_end, values_begin); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL); + + +} // namespace isai +} // namespace reference +} // namespace kernels +} // namespace gko diff --git a/reference/preconditioner/jacobi_kernels.cpp b/reference/preconditioner/jacobi_kernels.cpp index f2972965273..d72065452e0 100644 --- a/reference/preconditioner/jacobi_kernels.cpp +++ b/reference/preconditioner/jacobi_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 
2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,9 +33,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/preconditioner/jacobi_kernels.hpp" +#include #include +#include #include -#include #include @@ -44,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/allocator.hpp" #include "core/base/extended_float.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "reference/components/matrix_operations.hpp" @@ -66,15 +68,9 @@ inline bool has_same_nonzero_pattern(const IndexType *prev_row_ptr, const IndexType *curr_row_ptr, const IndexType *next_row_ptr) { - if (next_row_ptr - curr_row_ptr != curr_row_ptr - prev_row_ptr) { - return false; - } - for (; curr_row_ptr < next_row_ptr; ++prev_row_ptr, ++curr_row_ptr) { - if (*curr_row_ptr != *prev_row_ptr) { - return false; - } - } - return true; + return std::distance(curr_row_ptr, next_row_ptr) == + std::distance(prev_row_ptr, curr_row_ptr) && + std::equal(curr_row_ptr, next_row_ptr, prev_row_ptr); } @@ -253,6 +249,24 @@ inline void transpose_block(IndexType block_size, const SourceValueType *from, } +template > +inline void conj_transpose_block(IndexType block_size, + const SourceValueType *from, + size_type from_stride, ResultValueType *to, + size_type to_stride, + ValueConverter converter = {}) noexcept +{ + for (IndexType i = 0; i < block_size; ++i) { + for (IndexType j = 0; j < block_size; ++j) { + to[i * to_stride + j] = conj(converter(from[i + j * from_stride])); + } + } +} + + template -inline bool validate_precision_reduction_feasibility(IndexType block_size, - const ValueType *block, - size_type stride) +inline bool validate_precision_reduction_feasibility( + std::shared_ptr exec, IndexType block_size, + const ValueType *block, size_type stride) { using gko::detail::float_traits; - std::vector 
tmp(block_size * block_size); - std::vector perm(block_size); + vector tmp(block_size * block_size, {}, exec); + vector perm(block_size, {}, exec); std::iota(begin(perm), end(perm), IndexType{0}); for (IndexType i = 0; i < block_size; ++i) { for (IndexType j = 0; j < block_size; ++j) { @@ -341,9 +355,9 @@ void generate(std::shared_ptr exec, const auto group_size = storage_scheme.get_group_size(); const auto cond = conditioning.get_data(); for (size_type g = 0; g < num_blocks; g += group_size) { - std::vector> block(group_size); - std::vector> perm(group_size); - std::vector pr_descriptors(group_size, uint32{} - 1); + vector> block(group_size, {}, exec); + vector> perm(group_size, {}, exec); + vector pr_descriptors(group_size, uint32{} - 1, exec); // extract group of blocks, invert them, figure out storage precision for (size_type b = 0; b < group_size; ++b) { if (b + g >= num_blocks) { @@ -373,16 +387,18 @@ void generate(std::shared_ptr exec, using preconditioner::detail::get_supported_storage_reductions; pr_descriptors[b] = get_supported_storage_reductions( accuracy, cond[g + b], - [&block_size, &block, &b] { + [&exec, &block_size, &block, &b] { using target = reduce_precision; return validate_precision_reduction_feasibility( - block_size, block[b].get_const_data(), block_size); + exec, block_size, block[b].get_const_data(), + block_size); }, - [&block_size, &block, &b] { + [&exec, &block_size, &block, &b] { using target = reduce_precision>; return validate_precision_reduction_feasibility( - block_size, block[b].get_const_data(), block_size); + exec, block_size, block[b].get_const_data(), + block_size); }); } else { pr_descriptors[b] = preconditioner::detail:: @@ -544,6 +560,78 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL); +template +void transpose_jacobi( + std::shared_ptr exec, size_type num_blocks, + uint32 max_block_size, const Array &block_precisions, + const Array &block_pointers, const Array &blocks, + const 
preconditioner::block_interleaved_storage_scheme + &storage_scheme, + Array &out_blocks) +{ + const auto ptrs = block_pointers.get_const_data(); + const auto prec = block_precisions.get_const_data(); + const size_type matrix_size = ptrs[num_blocks]; + + for (size_type i = 0; i < num_blocks; ++i) { + const auto group_ofs = storage_scheme.get_group_offset(i); + const auto block_ofs = storage_scheme.get_block_offset(i); + const auto block_stride = storage_scheme.get_stride(); + const auto group = blocks.get_const_data() + group_ofs; + auto out_group = out_blocks.get_data() + group_ofs; + const auto block_size = ptrs[i + 1] - ptrs[i]; + const auto p = prec ? prec[i] : precision_reduction(); + GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( + ValueType, p, + transpose_block( + block_size, + reinterpret_cast(group) + block_ofs, + block_stride, + reinterpret_cast(out_group) + block_ofs, + block_stride)); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL); + + +template +void conj_transpose_jacobi( + std::shared_ptr exec, size_type num_blocks, + uint32 max_block_size, const Array &block_precisions, + const Array &block_pointers, const Array &blocks, + const preconditioner::block_interleaved_storage_scheme + &storage_scheme, + Array &out_blocks) +{ + const auto ptrs = block_pointers.get_const_data(); + const auto prec = block_precisions.get_const_data(); + const size_type matrix_size = ptrs[num_blocks]; + + for (size_type i = 0; i < num_blocks; ++i) { + const auto group_ofs = storage_scheme.get_group_offset(i); + const auto block_ofs = storage_scheme.get_block_offset(i); + const auto block_stride = storage_scheme.get_stride(); + const auto group = blocks.get_const_data() + group_ofs; + auto out_group = out_blocks.get_data() + group_ofs; + const auto block_size = ptrs[i + 1] - ptrs[i]; + const auto p = prec ? 
prec[i] : precision_reduction(); + GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( + ValueType, p, + conj_transpose_block( + block_size, + reinterpret_cast(group) + block_ofs, + block_stride, + reinterpret_cast(out_group) + block_ofs, + block_stride)); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL); + + template void convert_to_dense( std::shared_ptr exec, size_type num_blocks, diff --git a/reference/solver/bicg_kernels.cpp b/reference/solver/bicg_kernels.cpp new file mode 100644 index 00000000000..5142b9461fd --- /dev/null +++ b/reference/solver/bicg_kernels.cpp @@ -0,0 +1,140 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/solver/bicg_kernels.hpp" + + +#include +#include +#include +#include + + +namespace gko { +namespace kernels { +namespace reference { +/** + * @brief The BICG solver namespace. + * + * @ingroup bicg + */ +namespace bicg { + + +template +void initialize(std::shared_ptr exec, + const matrix::Dense *b, matrix::Dense *r, + matrix::Dense *z, matrix::Dense *p, + matrix::Dense *q, matrix::Dense *prev_rho, + matrix::Dense *rho, matrix::Dense *r2, + matrix::Dense *z2, matrix::Dense *p2, + matrix::Dense *q2, + Array *stop_status) +{ + for (size_type j = 0; j < b->get_size()[1]; ++j) { + rho->at(j) = zero(); + prev_rho->at(j) = one(); + stop_status->get_data()[j].reset(); + } + for (size_type i = 0; i < b->get_size()[0]; ++i) { + for (size_type j = 0; j < b->get_size()[1]; ++j) { + r->at(i, j) = b->at(i, j); + r2->at(i, j) = b->at(i, j); + z->at(i, j) = p->at(i, j) = q->at(i, j) = zero(); + z2->at(i, j) = p2->at(i, j) = q2->at(i, j) = zero(); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_INITIALIZE_KERNEL); + + +template +void step_1(std::shared_ptr exec, + matrix::Dense *p, const matrix::Dense *z, + matrix::Dense *p2, const matrix::Dense *z2, + const matrix::Dense *rho, + const matrix::Dense *prev_rho, + const Array *stop_status) +{ + for (size_type i = 0; i < p->get_size()[0]; ++i) { + for (size_type j = 0; j < p->get_size()[1]; ++j) { + if 
(stop_status->get_const_data()[j].has_stopped()) { + continue; + } + if (prev_rho->at(j) == zero()) { + p->at(i, j) = z->at(i, j); + p2->at(i, j) = z2->at(i, j); + } else { + auto tmp = rho->at(j) / prev_rho->at(j); + p->at(i, j) = z->at(i, j) + tmp * p->at(i, j); + p2->at(i, j) = z2->at(i, j) + tmp * p2->at(i, j); + } + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_1_KERNEL); + + +template +void step_2(std::shared_ptr exec, + matrix::Dense *x, matrix::Dense *r, + matrix::Dense *r2, const matrix::Dense *p, + const matrix::Dense *q, + const matrix::Dense *q2, + const matrix::Dense *beta, + const matrix::Dense *rho, + const Array *stop_status) +{ + for (size_type i = 0; i < x->get_size()[0]; ++i) { + for (size_type j = 0; j < x->get_size()[1]; ++j) { + if (stop_status->get_const_data()[j].has_stopped()) { + continue; + } + if (beta->at(j) != zero()) { + auto tmp = rho->at(j) / beta->at(j); + x->at(i, j) += tmp * p->at(i, j); + r->at(i, j) -= tmp * q->at(i, j); + r2->at(i, j) -= tmp * q2->at(i, j); + } + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_STEP_2_KERNEL); + + +} // namespace bicg +} // namespace reference +} // namespace kernels +} // namespace gko diff --git a/reference/solver/bicgstab_kernels.cpp b/reference/solver/bicgstab_kernels.cpp index 415f0ee427a..29927d18953 100644 --- a/reference/solver/bicgstab_kernels.cpp +++ b/reference/solver/bicgstab_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -32,12 +32,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/solver/bicgstab_kernels.hpp" + +#include + + #include #include #include -#include - namespace gko { namespace kernels { diff --git a/reference/solver/cg_kernels.cpp b/reference/solver/cg_kernels.cpp index 616d95bcd7b..bf4625de9de 100644 --- a/reference/solver/cg_kernels.cpp +++ b/reference/solver/cg_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/reference/solver/cgs_kernels.cpp b/reference/solver/cgs_kernels.cpp index b134d2d8513..f393bb4ecd5 100644 --- a/reference/solver/cgs_kernels.cpp +++ b/reference/solver/cgs_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/reference/solver/fcg_kernels.cpp b/reference/solver/fcg_kernels.cpp index 252dada8123..24c758d1140 100644 --- a/reference/solver/fcg_kernels.cpp +++ b/reference/solver/fcg_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/reference/solver/gmres_kernels.cpp b/reference/solver/gmres_kernels.cpp index c5b7a2636d7..fd79e7d8574 100644 --- a/reference/solver/gmres_kernels.cpp +++ b/reference/solver/gmres_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -55,28 +55,27 @@ namespace { template -void finish_arnoldi(matrix::Dense *next_krylov_basis, - matrix::Dense *krylov_bases, +void finish_arnoldi(size_type num_rows, matrix::Dense *krylov_bases, matrix::Dense *hessenberg_iter, size_type iter, const stopping_status *stop_status) { - for (size_type i = 0; i < next_krylov_basis->get_size()[1]; ++i) { + const auto krylov_bases_rowoffset = num_rows; + const auto next_krylov_rowoffset = (iter + 1) * krylov_bases_rowoffset; + for (size_type i = 0; i < hessenberg_iter->get_size()[1]; ++i) { if (stop_status[i].has_stopped()) { continue; } for (size_type k = 0; k < iter + 1; ++k) { hessenberg_iter->at(k, i) = 0; - for (size_type j = 0; j < next_krylov_basis->get_size()[0]; ++j) { + for (size_type j = 0; j < num_rows; ++j) { hessenberg_iter->at(k, i) += - next_krylov_basis->at(j, i) * - krylov_bases->at(j, - next_krylov_basis->get_size()[1] * k + i); + krylov_bases->at(j + next_krylov_rowoffset, i) * + conj(krylov_bases->at(j + k * krylov_bases_rowoffset, i)); } - for (size_type j = 0; j < next_krylov_basis->get_size()[0]; ++j) { - next_krylov_basis->at(j, i) -= + for (size_type j = 0; j < num_rows; ++j) { + krylov_bases->at(j + next_krylov_rowoffset, i) -= hessenberg_iter->at(k, i) * - krylov_bases->at(j, - next_krylov_basis->get_size()[1] * k + i); + krylov_bases->at(j + k * krylov_bases_rowoffset, i); } } // for i in 1:iter @@ -85,20 +84,19 @@ void finish_arnoldi(matrix::Dense *next_krylov_basis, // end hessenberg_iter->at(iter + 1, i) = 0; - for (size_type j = 0; j < next_krylov_basis->get_size()[0]; ++j) { + for (size_type j = 0; j < num_rows; ++j) { hessenberg_iter->at(iter + 1, i) += - next_krylov_basis->at(j, i) * next_krylov_basis->at(j, i); + krylov_bases->at(j + next_krylov_rowoffset, i) * + krylov_bases->at(j + next_krylov_rowoffset, i); } hessenberg_iter->at(iter + 1, i) = sqrt(hessenberg_iter->at(iter + 1, i)); - // hessenberg(iter, iter + 1) = 
norm(next_krylov_basis) - for (size_type j = 0; j < next_krylov_basis->get_size()[0]; ++j) { - next_krylov_basis->at(j, i) /= hessenberg_iter->at(iter + 1, i); - krylov_bases->at(j, next_krylov_basis->get_size()[1] * (iter + 1) + - i) = next_krylov_basis->at(j, i); + // hessenberg(iter + 1, iter) = norm(krylov_bases) + for (size_type j = 0; j < num_rows; ++j) { + krylov_bases->at(j + next_krylov_rowoffset, i) /= + hessenberg_iter->at(iter + 1, i); } // next_krylov_basis /= hessenberg(iter, iter + 1) - // krylov_bases(:, iter + 1) = next_krylov_basis // End of arnoldi } } @@ -114,27 +112,25 @@ void calculate_sin_and_cos(matrix::Dense *givens_sin, givens_cos->at(iter, rhs) = zero(); givens_sin->at(iter, rhs) = one(); } else { - auto hypotenuse = sqrt(hessenberg_iter->at(iter, rhs) * - hessenberg_iter->at(iter, rhs) + - hessenberg_iter->at(iter + 1, rhs) * - hessenberg_iter->at(iter + 1, rhs)); - givens_cos->at(iter, rhs) = - abs(hessenberg_iter->at(iter, rhs)) / hypotenuse; - givens_sin->at(iter, rhs) = givens_cos->at(iter, rhs) * - hessenberg_iter->at(iter + 1, rhs) / - hessenberg_iter->at(iter, rhs); + auto this_hess = hessenberg_iter->at(iter, rhs); + auto next_hess = hessenberg_iter->at(iter + 1, rhs); + const auto scale = abs(this_hess) + abs(next_hess); + const auto hypotenuse = + scale * sqrt(abs(this_hess / scale) * abs(this_hess / scale) + + abs(next_hess / scale) * abs(next_hess / scale)); + givens_cos->at(iter, rhs) = conj(this_hess) / hypotenuse; + givens_sin->at(iter, rhs) = conj(next_hess) / hypotenuse; } } template -void givens_rotation(matrix::Dense *next_krylov_basis, - matrix::Dense *givens_sin, +void givens_rotation(matrix::Dense *givens_sin, matrix::Dense *givens_cos, matrix::Dense *hessenberg_iter, size_type iter, const stopping_status *stop_status) { - for (size_type i = 0; i < next_krylov_basis->get_size()[1]; ++i) { + for (size_type i = 0; i < hessenberg_iter->get_size()[1]; ++i) { if (stop_status[i].has_stopped()) { continue; } @@ -142,13 
+138,13 @@ void givens_rotation(matrix::Dense *next_krylov_basis, auto temp = givens_cos->at(j, i) * hessenberg_iter->at(j, i) + givens_sin->at(j, i) * hessenberg_iter->at(j + 1, i); hessenberg_iter->at(j + 1, i) = - -givens_sin->at(j, i) * hessenberg_iter->at(j, i) + - givens_cos->at(j, i) * hessenberg_iter->at(j + 1, i); + -conj(givens_sin->at(j, i)) * hessenberg_iter->at(j, i) + + conj(givens_cos->at(j, i)) * hessenberg_iter->at(j + 1, i); hessenberg_iter->at(j, i) = temp; // temp = cos(j)*hessenberg(j) + // sin(j)*hessenberg(j+1) - // hessenberg(j+1) = -sin(j)*hessenberg(j) + - // cos(j)*hessenberg(j+1) + // hessenberg(j+1) = -conj(sin(j))*hessenberg(j) + + // conj(cos(j))*hessenberg(j+1) // hessenberg(j) = temp; } @@ -159,7 +155,7 @@ void givens_rotation(matrix::Dense *next_krylov_basis, givens_sin->at(iter, i) * hessenberg_iter->at(iter + 1, i); hessenberg_iter->at(iter + 1, i) = zero(); // hessenberg(iter) = cos(iter)*hessenberg(iter) + - // sin(iter)*hessenberg(iter) + // sin(iter)*hessenberg(iter + 1) // hessenberg(iter+1) = 0 } } @@ -168,9 +164,8 @@ void givens_rotation(matrix::Dense *next_krylov_basis, template void calculate_next_residual_norm( matrix::Dense *givens_sin, matrix::Dense *givens_cos, - matrix::Dense *residual_norm, - matrix::Dense *residual_norm_collection, - const matrix::Dense *b_norm, size_type iter, + matrix::Dense> *residual_norm, + matrix::Dense *residual_norm_collection, size_type iter, const stopping_status *stop_status) { for (size_type i = 0; i < residual_norm->get_size()[1]; ++i) { @@ -178,11 +173,12 @@ void calculate_next_residual_norm( continue; } residual_norm_collection->at(iter + 1, i) = - -givens_sin->at(iter, i) * residual_norm_collection->at(iter, i); + -conj(givens_sin->at(iter, i)) * + residual_norm_collection->at(iter, i); residual_norm_collection->at(iter, i) = givens_cos->at(iter, i) * residual_norm_collection->at(iter, i); residual_norm->at(0, i) = - abs(residual_norm_collection->at(iter + 1, i)) / b_norm->at(0, 
i); + abs(residual_norm_collection->at(iter + 1, i)); } } @@ -216,13 +212,13 @@ void calculate_qy(const matrix::Dense *krylov_bases, matrix::Dense *before_preconditioner, const size_type *final_iter_nums) { + const auto krylov_bases_rowoffset = before_preconditioner->get_size()[0]; for (size_type k = 0; k < before_preconditioner->get_size()[1]; ++k) { for (size_type i = 0; i < before_preconditioner->get_size()[0]; ++i) { before_preconditioner->at(i, k) = zero(); for (size_type j = 0; j < final_iter_nums[k]; ++j) { before_preconditioner->at(i, k) += - krylov_bases->at( - i, j * before_preconditioner->get_size()[1] + k) * + krylov_bases->at(i + j * krylov_bases_rowoffset, k) * y->at(j, k); } } @@ -236,20 +232,13 @@ void calculate_qy(const matrix::Dense *krylov_bases, template void initialize_1(std::shared_ptr exec, const matrix::Dense *b, - matrix::Dense *b_norm, matrix::Dense *residual, matrix::Dense *givens_sin, matrix::Dense *givens_cos, Array *stop_status, size_type krylov_dim) { + using NormValueType = remove_complex; for (size_type j = 0; j < b->get_size()[1]; ++j) { - // Calculate b norm - b_norm->at(0, j) = zero(); - for (size_type i = 0; i < b->get_size()[0]; ++i) { - b_norm->at(0, j) += b->at(i, j) * b->at(i, j); - } - b_norm->at(0, j) = sqrt(b_norm->at(0, j)); - for (size_type i = 0; i < b->get_size()[0]; ++i) { residual->at(i, j) = b->at(i, j); } @@ -267,7 +256,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL); template void initialize_2(std::shared_ptr exec, const matrix::Dense *residual, - matrix::Dense *residual_norm, + matrix::Dense> *residual_norm, matrix::Dense *residual_norm_collection, matrix::Dense *krylov_bases, Array *final_iter_nums, size_type krylov_dim) @@ -276,45 +265,29 @@ void initialize_2(std::shared_ptr exec, // Calculate residual norm residual_norm->at(0, j) = 0; for (size_type i = 0; i < residual->get_size()[0]; ++i) { - residual_norm->at(0, j) += residual->at(i, j) * residual->at(i, j); + 
residual_norm->at(0, j) += squared_norm(residual->at(i, j)); } residual_norm->at(0, j) = sqrt(residual_norm->at(0, j)); - - for (size_type i = 0; i < krylov_dim + 1; ++i) { - if (i == 0) { - residual_norm_collection->at(i, j) = residual_norm->at(0, j); - } else { - residual_norm_collection->at(i, j) = zero(); - } - } + residual_norm_collection->at(0, j) = residual_norm->at(0, j); for (size_type i = 0; i < residual->get_size()[0]; ++i) { krylov_bases->at(i, j) = residual->at(i, j) / residual_norm->at(0, j); } final_iter_nums->get_data()[j] = 0; } - - for (size_type j = residual->get_size()[1]; j < krylov_bases->get_size()[1]; - ++j) { - for (size_type i = 0; i < krylov_bases->get_size()[0]; ++i) { - krylov_bases->at(i, j) = zero(); - } - } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL); template -void step_1(std::shared_ptr exec, - matrix::Dense *next_krylov_basis, +void step_1(std::shared_ptr exec, size_type num_rows, matrix::Dense *givens_sin, matrix::Dense *givens_cos, - matrix::Dense *residual_norm, + matrix::Dense> *residual_norm, matrix::Dense *residual_norm_collection, matrix::Dense *krylov_bases, - matrix::Dense *hessenberg_iter, - const matrix::Dense *b_norm, size_type iter, + matrix::Dense *hessenberg_iter, size_type iter, Array *final_iter_nums, const Array *stop_status) { @@ -323,12 +296,12 @@ void step_1(std::shared_ptr exec, (1 - stop_status->get_const_data()[i].has_stopped()); } - finish_arnoldi(next_krylov_basis, krylov_bases, hessenberg_iter, iter, + finish_arnoldi(num_rows, krylov_bases, hessenberg_iter, iter, stop_status->get_const_data()); - givens_rotation(next_krylov_basis, givens_sin, givens_cos, hessenberg_iter, - iter, stop_status->get_const_data()); + givens_rotation(givens_sin, givens_cos, hessenberg_iter, iter, + stop_status->get_const_data()); calculate_next_residual_norm(givens_sin, givens_cos, residual_norm, - residual_norm_collection, b_norm, iter, + residual_norm_collection, iter, 
stop_status->get_const_data()); } diff --git a/reference/solver/ir_kernels.cpp b/reference/solver/ir_kernels.cpp index 48d6a9c219a..1febced33ad 100644 --- a/reference/solver/ir_kernels.cpp +++ b/reference/solver/ir_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/reference/solver/lower_trs_kernels.cpp b/reference/solver/lower_trs_kernels.cpp index c8f698ed711..b1678d06f83 100644 --- a/reference/solver/lower_trs_kernels.cpp +++ b/reference/solver/lower_trs_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/reference/solver/upper_trs_kernels.cpp b/reference/solver/upper_trs_kernels.cpp index 56626e367bd..02f3666ecf4 100644 --- a/reference/solver/upper_trs_kernels.cpp +++ b/reference/solver/upper_trs_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/reference/stop/criterion_kernels.cpp b/reference/stop/criterion_kernels.cpp index e730aaa301b..050ebab01af 100644 --- a/reference/stop/criterion_kernels.cpp +++ b/reference/stop/criterion_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/reference/stop/residual_norm_reduction_kernels.cpp b/reference/stop/residual_norm_kernels.cpp similarity index 71% rename from reference/stop/residual_norm_reduction_kernels.cpp rename to reference/stop/residual_norm_kernels.cpp index 1f3cb5b3fdf..fb968e0eae3 100644 --- a/reference/stop/residual_norm_reduction_kernels.cpp +++ b/reference/stop/residual_norm_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,42 +30,44 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/stop/residual_norm_reduction_kernels.hpp" +#include "core/stop/residual_norm_kernels.hpp" + + +#include #include #include #include - - -#include +#include namespace gko { namespace kernels { namespace reference { /** - * @brief The Residual norm reduction stopping criterion. + * @brief The Residual norm stopping criterion. 
* @ref resnorm * @ingroup resnorm */ -namespace residual_norm_reduction { +namespace residual_norm { template -void residual_norm_reduction(std::shared_ptr exec, - const matrix::Dense *tau, - const matrix::Dense *orig_tau, - remove_complex rel_residual_goal, - uint8 stoppingId, bool setFinalized, - Array *stop_status, - Array *device_storage, bool *all_converged, - bool *one_changed) +void residual_norm(std::shared_ptr exec, + const matrix::Dense *tau, + const matrix::Dense *orig_tau, + ValueType rel_residual_goal, uint8 stoppingId, + bool setFinalized, Array *stop_status, + Array *device_storage, bool *all_converged, + bool *one_changed) { + static_assert(is_complex_s::value == false, + "ValueType must not be complex in this function!"); *all_converged = true; *one_changed = false; for (size_type i = 0; i < tau->get_size()[1]; ++i) { - if (abs(tau->at(i)) < rel_residual_goal * abs(orig_tau->at(i))) { + if (tau->at(i) < rel_residual_goal * orig_tau->at(i)) { stop_status->get_data()[i].converge(stoppingId, setFinalized); *one_changed = true; } @@ -78,10 +80,11 @@ void residual_norm_reduction(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE( + GKO_DECLARE_RESIDUAL_NORM_KERNEL); -} // namespace residual_norm_reduction +} // namespace residual_norm } // namespace reference } // namespace kernels } // namespace gko diff --git a/reference/test/CMakeLists.txt b/reference/test/CMakeLists.txt index 322bf38e6a7..b359d8146ed 100644 --- a/reference/test/CMakeLists.txt +++ b/reference/test/CMakeLists.txt @@ -1,4 +1,7 @@ +include(${CMAKE_SOURCE_DIR}/cmake/create_test.cmake) + add_subdirectory(base) +add_subdirectory(components) add_subdirectory(factorization) add_subdirectory(log) add_subdirectory(matrix) diff --git a/reference/test/base/combination.cpp b/reference/test/base/combination.cpp index fcc89c69bcf..830b031e83e 100644 --- a/reference/test/base/combination.cpp +++ 
b/reference/test/base/combination.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -39,23 +39,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include +#include "core/test/utils.hpp" + + namespace { +template class Combination : public ::testing::Test { protected: - using mtx = gko::matrix::Dense<>; + using Mtx = gko::matrix::Dense; Combination() : exec{gko::ReferenceExecutor::create()}, - coefficients{gko::initialize({1}, exec), - gko::initialize({2}, exec)}, - operators{gko::initialize({{2.0, 3.0}, {1.0, 4.0}}, exec), - gko::initialize({{3.0, 2.0}, {2.0, 0.0}}, exec)} + coefficients{gko::initialize({1}, exec), + gko::initialize({2}, exec)}, + operators{ + gko::initialize({I({2.0, 3.0}), I({1.0, 4.0})}, exec), + gko::initialize({I({3.0, 2.0}), I({2.0, 0.0})}, exec)} {} std::shared_ptr exec; @@ -63,40 +67,46 @@ class Combination : public ::testing::Test { std::vector> operators; }; +TYPED_TEST_CASE(Combination, gko::test::ValueTypes); + -TEST_F(Combination, AppliesToVector) +TYPED_TEST(Combination, AppliesToVector) { /* cmb = [ 8 7 ] [ 5 4 ] */ - auto cmb = gko::Combination<>::create(coefficients[0], operators[0], - coefficients[1], operators[1]); - auto x = gko::initialize({1.0, 2.0}, exec); + using Mtx = typename TestFixture::Mtx; + auto cmb = gko::Combination::create( + this->coefficients[0], this->operators[0], this->coefficients[1], + this->operators[1]); + auto x = gko::initialize({1.0, 2.0}, this->exec); auto res = clone(x); cmb->apply(lend(x), lend(res)); - GKO_ASSERT_MTX_NEAR(res, l({22.0, 13.0}), 1e-15); + GKO_ASSERT_MTX_NEAR(res, l({22.0, 13.0}), r::value); } -TEST_F(Combination, AppliesLinearCombinationToVector) +TYPED_TEST(Combination, AppliesLinearCombinationToVector) { /* cmb = [ 8 7 ] [ 5 4 ] 
*/ - auto cmb = gko::Combination<>::create(coefficients[0], operators[0], - coefficients[1], operators[1]); - auto alpha = gko::initialize({3.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - auto x = gko::initialize({1.0, 2.0}, exec); + using Mtx = typename TestFixture::Mtx; + auto cmb = gko::Combination::create( + this->coefficients[0], this->operators[0], this->coefficients[1], + this->operators[1]); + auto alpha = gko::initialize({3.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto x = gko::initialize({1.0, 2.0}, this->exec); auto res = clone(x); cmb->apply(lend(alpha), lend(x), lend(beta), lend(res)); - GKO_ASSERT_MTX_NEAR(res, l({65.0, 37.0}), 1e-15); + GKO_ASSERT_MTX_NEAR(res, l({65.0, 37.0}), r::value); } diff --git a/reference/test/base/composition.cpp b/reference/test/base/composition.cpp index d9f00665432..76d71734d12 100644 --- a/reference/test/base/composition.cpp +++ b/reference/test/base/composition.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -39,60 +39,371 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include #include +#include "core/test/utils.hpp" + + namespace { +template +class DummyLinOp : public gko::EnableLinOp>, + public gko::EnableCreateMethod> { + friend class gko::EnablePolymorphicObject; + friend class gko::EnableCreateMethod; + +public: + using value_type = ValueType; + + bool apply_uses_initial_guess() const override { return true; } + +protected: + void apply_impl(const gko::LinOp *b, gko::LinOp *x) const override {} + + void apply_impl(const gko::LinOp *alpha, const gko::LinOp *b, + const gko::LinOp *beta, gko::LinOp *x) const override + {} + + explicit DummyLinOp(std::shared_ptr exec) + : gko::EnableLinOp(exec) + {} + + explicit DummyLinOp(std::shared_ptr exec, + gko::dim<2> size) + : gko::EnableLinOp(exec, size) + {} +}; + + +template class Composition : public ::testing::Test { protected: - using mtx = gko::matrix::Dense<>; + using Mtx = gko::matrix::Dense; + using value_type = T; Composition() : exec{gko::ReferenceExecutor::create()}, - operators{gko::initialize({2.0, 1.0}, exec), - gko::initialize({{3.0, 2.0}}, exec)} + operators{ + gko::initialize(I({2.0, 1.0}), exec), + gko::initialize({I({3.0, 2.0})}, exec), + gko::initialize( + {I({-1.0, 1.0, 2.0}), I({5.0, -3.0, 0.0})}, exec), + gko::initialize( + {I({9.0, 4.0}), I({6.0, -2.0}), I({-3.0, 2.0})}, + exec), + gko::initialize({I({1.0, 0.0}), I({0.0, 1.0})}, exec), + gko::initialize({I({1.0, 0.0}), I({0.0, 1.0})}, exec)}, + identity{ + gko::initialize({I({1.0, 0.0}), I({0.0, 1.0})}, exec)}, + product{gko::initialize({I({-9.0, -2.0}), I({27.0, 26.0})}, + exec)} {} std::shared_ptr exec; std::vector> coefficients; std::vector> operators; + std::shared_ptr identity; + std::shared_ptr product; }; +TYPED_TEST_CASE(Composition, gko::test::ValueTypes); -TEST_F(Composition, AppliesToVector) + +TYPED_TEST(Composition, AppliesSingleToVector) +{ + /* + cmp = [ -9 -2 ] + [ 27 26 ] + */ + using Mtx = typename TestFixture::Mtx; + auto cmp = gko::Composition::create(this->product); + auto x = 
gko::initialize({1.0, 2.0}, this->exec); + auto res = clone(x); + + cmp->apply(lend(x), lend(res)); + + GKO_ASSERT_MTX_NEAR(res, l({-13.0, 79.0}), r::value); +} + + +TYPED_TEST(Composition, AppliesSingleLinearCombinationToVector) +{ + /* + cmp = [ -9 -2 ] + [ 27 26 ] + */ + using Mtx = typename TestFixture::Mtx; + auto cmp = gko::Composition::create(this->product); + auto alpha = gko::initialize({3.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto x = gko::initialize({1.0, 2.0}, this->exec); + auto res = clone(x); + + cmp->apply(lend(alpha), lend(x), lend(beta), lend(res)); + + GKO_ASSERT_MTX_NEAR(res, l({-40.0, 235.0}), r::value); +} + + +TYPED_TEST(Composition, AppliesToVector) { /* cmp = [ 2 ] * [ 3 2 ] [ 1 ] */ - auto cmp = gko::Composition<>::create(operators[0], operators[1]); - auto x = gko::initialize({1.0, 2.0}, exec); + using Mtx = typename TestFixture::Mtx; + auto cmp = gko::Composition::create(this->operators[0], + this->operators[1]); + auto x = gko::initialize({1.0, 2.0}, this->exec); auto res = clone(x); cmp->apply(lend(x), lend(res)); - GKO_ASSERT_MTX_NEAR(res, l({14.0, 7.0}), 1e-15); + GKO_ASSERT_MTX_NEAR(res, l({14.0, 7.0}), r::value); } -TEST_F(Composition, AppliesLinearCombinationToVector) +TYPED_TEST(Composition, AppliesLinearCombinationToVector) { /* cmp = [ 2 ] * [ 3 2 ] [ 1 ] */ - auto cmp = gko::Composition<>::create(operators[0], operators[1]); - auto alpha = gko::initialize({3.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - auto x = gko::initialize({1.0, 2.0}, exec); + using Mtx = typename TestFixture::Mtx; + auto cmp = gko::Composition::create(this->operators[0], + this->operators[1]); + auto alpha = gko::initialize({3.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto x = gko::initialize({1.0, 2.0}, this->exec); + auto res = clone(x); + + cmp->apply(lend(alpha), lend(x), lend(beta), lend(res)); + + GKO_ASSERT_MTX_NEAR(res, l({41.0, 19.0}), r::value); +} + + 
+TYPED_TEST(Composition, AppliesLongerToVector) +{ + /* + cmp = [ 2 ] * [ 3 2 ] * [ -9 -2 ] + [ 1 ] [ 27 26 ] + */ + using Mtx = typename TestFixture::Mtx; + auto cmp = gko::Composition::create( + this->operators[0], this->operators[1], this->product); + auto x = gko::initialize({1.0, 2.0}, this->exec); + auto res = clone(x); + + cmp->apply(lend(x), lend(res)); + + GKO_ASSERT_MTX_NEAR(res, l({238.0, 119.0}), r::value); +} + + +TYPED_TEST(Composition, AppliesLongerLinearCombinationToVector) +{ + /* + cmp = [ 2 ] * [ 3 2 ] * [ -9 -2 ] + [ 1 ] [ 27 26 ] + */ + using Mtx = typename TestFixture::Mtx; + auto cmp = gko::Composition::create( + this->operators[0], this->operators[1], this->product); + auto alpha = gko::initialize({3.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto x = gko::initialize({1.0, 2.0}, this->exec); + auto res = clone(x); + + cmp->apply(lend(alpha), lend(x), lend(beta), lend(res)); + + GKO_ASSERT_MTX_NEAR(res, l({713.0, 355.0}), r::value); +} + + +TYPED_TEST(Composition, AppliesLongestToVector) +{ + /* + cmp = [ 2 ] * [ 3 2 ] * [ -1 1 2 ] * [ 9 4 ] * [ 1 0 ]^2 + [ 1 ] [ 5 -3 0 ] [ 6 -2 ] [ 0 1 ] + [ -3 2 ] + */ + using Mtx = typename TestFixture::Mtx; + auto cmp = gko::Composition::create(this->operators.begin(), + this->operators.end()); + auto x = gko::initialize({1.0, 2.0}, this->exec); + auto res = clone(x); + + cmp->apply(lend(x), lend(res)); + + GKO_ASSERT_MTX_NEAR(res, l({238.0, 119.0}), r::value); +} + + +TYPED_TEST(Composition, AppliesLongestLinearCombinationToVector) +{ + /* + cmp = [ 2 ] * [ 3 2 ] * [ -1 1 2 ] * [ 9 4 ] * [ 1 0 ]^2 + [ 1 ] [ 5 -3 0 ] [ 6 -2 ] [ 0 1 ] + [ -3 2 ] + */ + using Mtx = typename TestFixture::Mtx; + auto cmp = gko::Composition::create(this->operators.begin(), + this->operators.end()); + auto alpha = gko::initialize({3.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto x = gko::initialize({1.0, 2.0}, this->exec); auto res = clone(x); cmp->apply(lend(alpha), 
lend(x), lend(beta), lend(res)); - GKO_ASSERT_MTX_NEAR(res, l({41.0, 19.0}), 1e-15); + GKO_ASSERT_MTX_NEAR(res, l({713.0, 355.0}), r::value); +} + + +TYPED_TEST(Composition, AppliesLongestToVectorMultipleRhs) +{ + /* + cmp = [ 2 ] * [ 3 2 ] * [ -1 1 2 ] * [ 9 4 ] * [ 1 0 ]^2 + [ 1 ] [ 5 -3 0 ] [ 6 -2 ] [ 0 1 ] + [ -3 2 ] + */ + using Mtx = typename TestFixture::Mtx; + auto cmp = gko::Composition::create(this->operators.begin(), + this->operators.end()); + auto x = clone(this->identity); + auto res = clone(x); + + cmp->apply(lend(x), lend(res)); + + GKO_ASSERT_MTX_NEAR(res, l({{54.0, 92.0}, {27.0, 46.0}}), + r::value); +} + + +TYPED_TEST(Composition, AppliesLongestLinearCombinationToVectorMultipleRhs) +{ + /* + cmp = [ 2 ] * [ 3 2 ] * [ -1 1 2 ] * [ 9 4 ] * [ 1 0 ]^2 + [ 1 ] [ 5 -3 0 ] [ 6 -2 ] [ 0 1 ] + [ -3 2 ] + */ + using Mtx = typename TestFixture::Mtx; + auto cmp = gko::Composition::create(this->operators.begin(), + this->operators.end()); + auto alpha = gko::initialize({3.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto x = clone(this->identity); + auto res = clone(x); + + cmp->apply(lend(alpha), lend(x), lend(beta), lend(res)); + + GKO_ASSERT_MTX_NEAR(res, l({{161.0, 276.0}, {81.0, 137.0}}), + r::value); +} + + +TYPED_TEST(Composition, AppliesToVectorWithInitialGuess) +{ + /* + cmp = I * DummyLinOp * I + */ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto cmp = gko::Composition::create( + this->identity, + DummyLinOp::create(this->exec, this->identity->get_size()), + this->identity); + auto x = gko::initialize({1.0, 2.0}, this->exec); + auto res = clone(x); + + cmp->apply(lend(x), lend(res)); + + GKO_ASSERT_MTX_NEAR(res, l({1.0, 2.0}), 0); +} + + +TYPED_TEST(Composition, AppliesToVectorWithInitialGuess2) +{ + /* + cmp = I * DummyLinOp(2x3) * DummyLinOp(3x2) * I + */ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto size1 = 
gko::dim<2>(3, 2); + auto size2 = gko::dim<2>(2, 3); + auto cmp = gko::Composition::create( + this->identity, DummyLinOp::create(this->exec, size2), + DummyLinOp::create(this->exec, size1), this->identity); + auto x = gko::initialize({1.0, 2.0}, this->exec); + auto res = clone(x); + + cmp->apply(lend(x), lend(res)); + + GKO_ASSERT_MTX_NEAR(res, l({0.0, 0.0}), 0); +} + + +TYPED_TEST(Composition, AppliesToVectorWithInitialGuess3) +{ + /* + cmp = DummyLinOp * I + */ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto cmp = gko::Composition::create( + DummyLinOp::create(this->exec, this->identity->get_size()), + this->identity); + auto x = gko::initialize({1.0, 2.0}, this->exec); + auto res = clone(x); + + cmp->apply(lend(x), lend(res)); + + GKO_ASSERT_MTX_NEAR(res, l({1.0, 2.0}), 0); +} + + +TYPED_TEST(Composition, AppliesToVectorWithInitialGuess4) +{ + /* + cmp = I * DummyLinOp(2x3) * DummyLinOp(3x2) + */ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto size1 = gko::dim<2>(3, 2); + auto size2 = gko::dim<2>(2, 3); + auto cmp = gko::Composition::create( + this->identity, DummyLinOp::create(this->exec, size2), + DummyLinOp::create(this->exec, size1)); + auto x = gko::initialize({1.0, 2.0}, this->exec); + auto res = clone(x); + + cmp->apply(lend(x), lend(res)); + + GKO_ASSERT_MTX_NEAR(res, l({0.0, 0.0}), 0); +} + + +TYPED_TEST(Composition, AppliesToVectorWithInitialGuess5) +{ + /* + cmp = DummyLinOp(2x3) * DummyLinOp(3x2) * I + */ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto size1 = gko::dim<2>(3, 2); + auto size2 = gko::dim<2>(2, 3); + auto cmp = gko::Composition::create( + DummyLinOp::create(this->exec, size2), + DummyLinOp::create(this->exec, size1), this->identity); + auto x = gko::initialize({1.0, 2.0}, this->exec); + auto res = clone(x); + + cmp->apply(lend(x), lend(res)); + + GKO_ASSERT_MTX_NEAR(res,
l({1.0, 2.0}), 0); } diff --git a/reference/test/base/perturbation.cpp b/reference/test/base/perturbation.cpp index f265776d935..fe1cc8692fe 100644 --- a/reference/test/base/perturbation.cpp +++ b/reference/test/base/perturbation.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -42,21 +42,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include "core/test/utils/assertions.hpp" +#include "core/test/utils.hpp" namespace { +template class Perturbation : public ::testing::Test { protected: - using mtx = gko::matrix::Dense<>; + using Mtx = gko::matrix::Dense; Perturbation() : exec{gko::ReferenceExecutor::create()}, - basis{gko::initialize({2.0, 1.0}, exec)}, - projector{gko::initialize({{3.0, 2.0}}, exec)}, - scalar{gko::initialize({2.0}, exec)} + basis{gko::initialize({2.0, 1.0}, exec)}, + projector{gko::initialize({I({3.0, 2.0})}, exec)}, + scalar{gko::initialize({2.0}, exec)} {} std::shared_ptr exec; @@ -65,72 +66,80 @@ class Perturbation : public ::testing::Test { std::shared_ptr scalar; }; +TYPED_TEST_CASE(Perturbation, gko::test::ValueTypes); -TEST_F(Perturbation, AppliesToVector) + +TYPED_TEST(Perturbation, AppliesToVector) { /* cmp = I + 2 * [ 2 ] * [ 3 2 ] [ 1 ] */ - auto cmp = gko::Perturbation<>::create(scalar, basis, projector); - auto x = gko::initialize({1.0, 2.0}, exec); - auto res = mtx::create_with_config_of(gko::lend(x)); + using Mtx = typename TestFixture::Mtx; + auto cmp = gko::Perturbation::create(this->scalar, this->basis, + this->projector); + auto x = gko::initialize({1.0, 2.0}, this->exec); + auto res = Mtx::create_with_config_of(gko::lend(x)); cmp->apply(gko::lend(x), gko::lend(res)); - GKO_ASSERT_MTX_NEAR(res, l({29.0, 16.0}), 1e-15); + GKO_ASSERT_MTX_NEAR(res, l({29.0, 16.0}), r::value); } 
-TEST_F(Perturbation, AppliesLinearCombinationToVector) +TYPED_TEST(Perturbation, AppliesLinearCombinationToVector) { /* cmp = I + 2 * [ 2 ] * [ 3 2 ] [ 1 ] */ - auto cmp = gko::Perturbation<>::create(scalar, basis, projector); - auto alpha = gko::initialize({3.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - auto x = gko::initialize({1.0, 2.0}, exec); + using Mtx = typename TestFixture::Mtx; + auto cmp = gko::Perturbation::create(this->scalar, this->basis, + this->projector); + auto alpha = gko::initialize({3.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto x = gko::initialize({1.0, 2.0}, this->exec); auto res = gko::clone(x); cmp->apply(gko::lend(alpha), gko::lend(x), gko::lend(beta), gko::lend(res)); - GKO_ASSERT_MTX_NEAR(res, l({86.0, 46.0}), 1e-15); + GKO_ASSERT_MTX_NEAR(res, l({86.0, 46.0}), r::value); } -TEST_F(Perturbation, ConstructionByBasisAppliesToVector) +TYPED_TEST(Perturbation, ConstructionByBasisAppliesToVector) { /* cmp = I + 2 * [ 2 ] * [ 2 1 ] [ 1 ] */ - auto cmp = gko::Perturbation<>::create(scalar, basis); - auto x = gko::initialize({1.0, 2.0}, exec); - auto res = mtx::create_with_config_of(gko::lend(x)); + using Mtx = typename TestFixture::Mtx; + auto cmp = gko::Perturbation::create(this->scalar, this->basis); + auto x = gko::initialize({1.0, 2.0}, this->exec); + auto res = Mtx::create_with_config_of(gko::lend(x)); cmp->apply(gko::lend(x), gko::lend(res)); - GKO_ASSERT_MTX_NEAR(res, l({17.0, 10.0}), 1e-15); + GKO_ASSERT_MTX_NEAR(res, l({17.0, 10.0}), r::value); } -TEST_F(Perturbation, ConstructionByBasisAppliesLinearCombinationToVector) +TYPED_TEST(Perturbation, ConstructionByBasisAppliesLinearCombinationToVector) { /* cmp = I + 2 * [ 2 ] * [ 2 1 ] [ 1 ] */ - auto cmp = gko::Perturbation<>::create(scalar, basis); - auto alpha = gko::initialize({3.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - auto x = gko::initialize({1.0, 2.0}, exec); + using Mtx = typename TestFixture::Mtx; + auto cmp = 
gko::Perturbation::create(this->scalar, this->basis); + auto alpha = gko::initialize({3.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto x = gko::initialize({1.0, 2.0}, this->exec); auto res = gko::clone(x); cmp->apply(gko::lend(alpha), gko::lend(x), gko::lend(beta), gko::lend(res)); - GKO_ASSERT_MTX_NEAR(res, l({50.0, 28.0}), 1e-15); + GKO_ASSERT_MTX_NEAR(res, l({50.0, 28.0}), r::value); } diff --git a/reference/test/components/CMakeLists.txt b/reference/test/components/CMakeLists.txt new file mode 100644 index 00000000000..9c1dca5bcfa --- /dev/null +++ b/reference/test/components/CMakeLists.txt @@ -0,0 +1,3 @@ +ginkgo_create_test(fill_array) +ginkgo_create_test(precision_conversion) +ginkgo_create_test(prefix_sum) diff --git a/reference/test/components/fill_array.cpp b/reference/test/components/fill_array.cpp new file mode 100644 index 00000000000..51ec5d8dd09 --- /dev/null +++ b/reference/test/components/fill_array.cpp @@ -0,0 +1,84 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/components/fill_array.hpp" + + +#include +#include +#include + + +#include + + +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +template +class FillArray : public ::testing::Test { +protected: + using value_type = T; + FillArray() + : ref(gko::ReferenceExecutor::create()), + total_size(6344), + expected(ref, total_size), + vals(ref, total_size) + { + std::fill_n(expected.get_data(), total_size, T(6453)); + } + + std::shared_ptr ref; + gko::size_type total_size; + gko::Array expected; + gko::Array vals; +}; + +TYPED_TEST_CASE(FillArray, gko::test::ValueAndIndexTypes); + + +TYPED_TEST(FillArray, EqualsReference) +{ + using T = typename TestFixture::value_type; + gko::kernels::reference::components::fill_array( + this->ref, this->vals.get_data(), this->total_size, T(6453)); + GKO_ASSERT_ARRAY_EQ(this->vals, this->expected); +} + + +} // namespace diff --git a/reference/test/components/precision_conversion.cpp b/reference/test/components/precision_conversion.cpp new file mode 100644 index 00000000000..10c96e82f23 --- /dev/null +++ b/reference/test/components/precision_conversion.cpp @@ -0,0 +1,150 @@ 
+/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include +#include +#include +#include +#include + + +#include + + +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +class PrecisionConversion : public ::testing::Test { +protected: + PrecisionConversion() + : ref(gko::ReferenceExecutor::create()), + rand(293), + total_size(42793), + vals(ref, total_size), + cvals(ref, total_size), + vals2(ref, 1), + expected_float(ref, 1), + expected_double(ref, 1) + { + auto maxval = 1e10f; + std::uniform_real_distribution dist(-maxval, maxval); + for (gko::size_type i = 0; i < total_size; ++i) { + vals.get_data()[i] = dist(rand); + cvals.get_data()[i] = {dist(rand), dist(rand)}; + } + gko::uint64 rawdouble{0x4218888000889111ULL}; + gko::uint32 rawfloat{0x50c44400UL}; + gko::uint64 rawrounded{0x4218888000000000ULL}; + std::memcpy(vals2.get_data(), &rawdouble, sizeof(double)); + std::memcpy(expected_float.get_data(), &rawfloat, sizeof(float)); + std::memcpy(expected_double.get_data(), &rawrounded, sizeof(double)); + } + + std::shared_ptr ref; + std::default_random_engine rand; + gko::size_type total_size; + gko::Array vals; + gko::Array vals2; + gko::Array expected_float; + gko::Array expected_double; + gko::Array> cvals; +}; + + +TEST_F(PrecisionConversion, ConvertsReal) +{ + gko::Array tmp; + gko::Array out; + + tmp = vals; + out = tmp; + + GKO_ASSERT_ARRAY_EQ(vals, out); +} + + +TEST_F(PrecisionConversion, ConversionRounds) +{ + gko::Array tmp; + gko::Array out; + + tmp = vals2; + out = tmp; + + GKO_ASSERT_ARRAY_EQ(tmp, expected_float); + GKO_ASSERT_ARRAY_EQ(out, expected_double); +} + + +TEST_F(PrecisionConversion, ConvertsRealWithSetExecutor) +{ + gko::Array tmp{ref}; + gko::Array out{ref}; + + tmp = vals; + out = tmp; + + GKO_ASSERT_ARRAY_EQ(vals, out); +} + + +TEST_F(PrecisionConversion, ConvertsRealFromView) +{ + gko::Array tmp{ref}; + gko::Array out{ref}; + + tmp = gko::Array::view(ref, vals.get_num_elems(), vals.get_data()); + out = tmp; 
+ + GKO_ASSERT_ARRAY_EQ(vals, out); +} + + +TEST_F(PrecisionConversion, ConvertsComplex) +{ + gko::Array> tmp; + gko::Array> out; + + tmp = cvals; + out = tmp; + + GKO_ASSERT_ARRAY_EQ(cvals, out); +} + + +} // namespace diff --git a/reference/test/components/prefix_sum.cpp b/reference/test/components/prefix_sum.cpp new file mode 100644 index 00000000000..2766326bc4a --- /dev/null +++ b/reference/test/components/prefix_sum.cpp @@ -0,0 +1,77 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/components/prefix_sum.hpp" + + +#include +#include +#include + + +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +template +class PrefixSum : public ::testing::Test { +protected: + using index_type = T; + PrefixSum() + : exec(gko::ReferenceExecutor::create()), + vals{3, 5, 6, 7, 1, 5, 9, 7, 2, 0, 5}, + expected{0, 3, 8, 14, 21, 22, 27, 36, 43, 45, 45} + {} + + std::shared_ptr exec; + std::vector vals; + std::vector expected; +}; + +TYPED_TEST_CASE(PrefixSum, gko::test::IndexTypes); + + +TYPED_TEST(PrefixSum, Works) +{ + gko::kernels::reference::components::prefix_sum( + this->exec, this->vals.data(), this->vals.size()); + + ASSERT_EQ(this->vals, this->expected); +} + + +} // namespace diff --git a/reference/test/factorization/CMakeLists.txt b/reference/test/factorization/CMakeLists.txt index 36c21b93eea..b52c2d938d7 100644 --- a/reference/test/factorization/CMakeLists.txt +++ b/reference/test/factorization/CMakeLists.txt @@ -1 +1,3 @@ +ginkgo_create_test(par_ict_kernels) ginkgo_create_test(par_ilu_kernels) +ginkgo_create_test(par_ilut_kernels) diff --git a/reference/test/factorization/par_ict_kernels.cpp b/reference/test/factorization/par_ict_kernels.cpp new file mode 100644 index 00000000000..9be9045492b --- /dev/null +++ b/reference/test/factorization/par_ict_kernels.cpp @@ -0,0 +1,390 @@ +/************************************************************* 
+Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#include +#include + + +#include + + +#include +#include +#include + + +#include "core/factorization/factorization_kernels.hpp" +#include "core/factorization/par_ict_kernels.hpp" +#include "core/test/utils.hpp" + + +namespace { + + +class DummyLinOp : public gko::EnableLinOp, + public gko::EnableCreateMethod { +public: + DummyLinOp(std::shared_ptr exec, + gko::dim<2> size = gko::dim<2>{}) + : EnableLinOp(exec, size) + {} + +protected: + void apply_impl(const gko::LinOp *b, gko::LinOp *x) const override {} + + void apply_impl(const gko::LinOp *alpha, const gko::LinOp *b, + const gko::LinOp *beta, gko::LinOp *x) const override + {} +}; + + +template +class ParIct : public ::testing::Test { +protected: + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using factorization_type = + gko::factorization::ParIct; + using Coo = gko::matrix::Coo; + using Csr = gko::matrix::Csr; + using Dense = gko::matrix::Dense; + + ParIct() + : ref(gko::ReferenceExecutor::create()), + exec(std::static_pointer_cast(ref)), + identity(gko::initialize( + {{1., 0., 0.}, {0., 1., 0.}, {0., 0., 1.}}, ref)), + lower_tri(gko::initialize( + {{1., 0., 0.}, {1., 1., 0.}, {1., 1., 1.}}, ref)), + upper_tri(gko::initialize( + {{2., 1., 1.}, {0., -3., 1.}, {0., 0., 4.}}, ref)), + mtx_system(gko::initialize({{9., 0., -6., 3.}, + {0., 36., 18., 24.}, + {-6., 18., 17., 14.}, + {-3., 24., 14., 18.}}, + ref)), + mtx_init(gko::initialize({{9., 0., -6., 3.}, + {0., 0., 18., 24.}, + {-6., 18., 17., 14.}, + {-3., 24., 14., 18.}}, + ref)), + mtx_l_system(gko::initialize({{1., 0., 0., 0.}, + {0., 1., 0., 0.}, + {1., 1., 1., 0.}, + {1., 1., 0., 1.}}, + ref)), + mtx_l(gko::initialize({{1., 0., 0., 0.}, + {1., 2., 0., 0.}, + {0., 0., 3., 0.}, + {-2., 0., -3., 4.}}, + ref)), + 
mtx_llt(gko::initialize({{1., 1., 0., -2.}, + {1., 5., 0., -2.}, + {0., 0., 9., -9.}, + {-2., -2., -9., 29.}}, + ref)), + mtx_l_init_expect(gko::initialize( + {{3., 0., 0., 0.}, + {0., 1., 0., 0.}, + {-6., 18., static_cast(sqrt(17.)), 0.}, + {-3., 24., 14., static_cast(sqrt(18.))}}, + ref)), + mtx_l_add_expect(gko::initialize({{1., 0., 0., 0.}, + {1., 2., 0., 0.}, + {-6., 9., 3., 0.}, + {-2., 13., -3., 4.}}, + ref)), + mtx_l_it_expect(gko::initialize({{3., 0., 0., 0.}, + {0., 6., 0., 0.}, + {-2., 3., 2., 0.}, + {-1., 4., 0., 1.}}, + ref)), + mtx_l_small_expect(gko::initialize( + {{3., 0., 0., 0.}, + {0., 6., 0., 0.}, + {-2., 3., 2., 0.}, + {0., 4., 0., static_cast(sqrt(2.))}}, + ref)), + mtx_l_large_expect(gko::initialize({{3., 0., 0., 0.}, + {0., 6., 0., 0.}, + {-2., 3., 2., 0.}, + {-1., 4., 0., 1.}}, + ref)), + fact_fact(factorization_type::build().on(exec)), + tol{r::value} + {} + + std::shared_ptr ref; + std::shared_ptr exec; + std::shared_ptr identity; + std::shared_ptr lower_tri; + std::shared_ptr upper_tri; + std::shared_ptr mtx_system; + std::unique_ptr mtx_l_system; + std::unique_ptr mtx_init; + std::unique_ptr mtx_l; + std::unique_ptr mtx_llt; + std::unique_ptr mtx_l_init_expect; + std::unique_ptr mtx_l_add_expect; + std::unique_ptr mtx_l_it_expect; + std::unique_ptr mtx_l_small_expect; + std::unique_ptr mtx_l_large_expect; + std::unique_ptr fact_fact; + gko::remove_complex tol; +}; + +TYPED_TEST_CASE(ParIct, gko::test::ValueIndexTypes); + + +TYPED_TEST(ParIct, KernelInitializeRowPtrsL) +{ + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + auto res_mtx_l = Csr::create(this->exec, this->mtx_system->get_size()); + auto row_ptrs = res_mtx_l->get_const_row_ptrs(); + + gko::kernels::reference::factorization::initialize_row_ptrs_l( + this->ref, this->mtx_system.get(), res_mtx_l->get_row_ptrs()); + + ASSERT_EQ(row_ptrs[0], 0); + ASSERT_EQ(row_ptrs[1], 1); + ASSERT_EQ(row_ptrs[2], 2); + ASSERT_EQ(row_ptrs[3], 5); + 
ASSERT_EQ(row_ptrs[4], 9); +} + + +TYPED_TEST(ParIct, KernelInitializeL) +{ + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + auto res_mtx_l = Csr::create(this->exec, this->mtx_system->get_size(), 9); + auto row_ptrs = res_mtx_l->get_const_row_ptrs(); + + gko::kernels::reference::factorization::initialize_row_ptrs_l( + this->ref, this->mtx_init.get(), res_mtx_l->get_row_ptrs()); + gko::kernels::reference::factorization::initialize_l( + this->ref, this->mtx_init.get(), res_mtx_l.get(), true); + + GKO_ASSERT_MTX_NEAR(res_mtx_l, this->mtx_l_init_expect, this->tol); + GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_l, this->mtx_l_init_expect); +} + + +TYPED_TEST(ParIct, KernelAddCandidates) +{ + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + auto res_mtx_l = Csr::create(this->exec, this->mtx_system->get_size()); + + gko::kernels::reference::par_ict_factorization::add_candidates( + this->ref, this->mtx_llt.get(), this->mtx_system.get(), + this->mtx_l.get(), res_mtx_l.get()); + + GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_l, this->mtx_l_add_expect); + GKO_ASSERT_MTX_NEAR(res_mtx_l, this->mtx_l_add_expect, this->tol); +} + + +TYPED_TEST(ParIct, KernelComputeLU) +{ + using Csr = typename TestFixture::Csr; + using Coo = typename TestFixture::Coo; + using value_type = typename TestFixture::value_type; + auto mtx_l_coo = Coo::create(this->exec, this->mtx_system->get_size()); + this->mtx_l_system->convert_to(mtx_l_coo.get()); + + gko::kernels::reference::par_ict_factorization::compute_factor( + this->ref, this->mtx_system.get(), this->mtx_l_system.get(), + mtx_l_coo.get()); + + GKO_ASSERT_MTX_NEAR(this->mtx_l_system, this->mtx_l_it_expect, this->tol); +} + + +TYPED_TEST(ParIct, ThrowNotSupportedForWrongLinOp) +{ + auto lin_op = DummyLinOp::create(this->ref); + + ASSERT_THROW(this->fact_fact->generate(gko::share(lin_op)), + gko::NotSupported); +} + + +TYPED_TEST(ParIct, ThrowDimensionMismatch) +{ + 
using Csr = typename TestFixture::Csr; + auto matrix = Csr::create(this->ref, gko::dim<2>{2, 3}, 4); + + ASSERT_THROW(this->fact_fact->generate(gko::share(matrix)), + gko::DimensionMismatch); +} + + +TYPED_TEST(ParIct, SetStrategies) +{ + using Csr = typename TestFixture::Csr; + using factorization_type = typename TestFixture::factorization_type; + auto l_strategy = std::make_shared(); + auto lt_strategy = std::make_shared(); + + auto factory = factorization_type::build() + .with_l_strategy(l_strategy) + .with_lt_strategy(lt_strategy) + .on(this->ref); + auto fact = factory->generate(this->mtx_system); + + ASSERT_EQ(factory->get_parameters().l_strategy, l_strategy); + ASSERT_EQ(fact->get_l_factor()->get_strategy()->get_name(), + l_strategy->get_name()); + ASSERT_EQ(factory->get_parameters().lt_strategy, lt_strategy); + ASSERT_EQ(fact->get_lt_factor()->get_strategy()->get_name(), + lt_strategy->get_name()); +} + + +TYPED_TEST(ParIct, IsConsistentWithComposition) +{ + auto fact = this->fact_fact->generate(this->mtx_system); + + auto lin_op_l_factor = + static_cast(gko::lend(fact->get_l_factor())); + auto lin_op_lt_factor = + static_cast(gko::lend(fact->get_lt_factor())); + auto first_operator = gko::lend(fact->get_operators()[0]); + auto second_operator = gko::lend(fact->get_operators()[1]); + + ASSERT_EQ(lin_op_l_factor, first_operator); + ASSERT_EQ(lin_op_lt_factor, second_operator); +} + + +TYPED_TEST(ParIct, GenerateIdentity) +{ + auto fact = this->fact_fact->generate(this->identity); + + GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->identity, this->tol); + GKO_ASSERT_MTX_NEAR(fact->get_lt_factor(), this->identity, this->tol); +} + + +TYPED_TEST(ParIct, GenerateDenseIdentity) +{ + using Dense = typename TestFixture::Dense; + auto dense_id = Dense::create(this->exec, this->identity->get_size()); + this->identity->convert_to(dense_id.get()); + auto fact = this->fact_fact->generate(gko::share(dense_id)); + + GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), 
this->identity, this->tol); + GKO_ASSERT_MTX_NEAR(fact->get_lt_factor(), this->identity, this->tol); +} + + +TYPED_TEST(ParIct, GenerateWithExactSmallLimit) +{ + using factorization_type = typename TestFixture::factorization_type; + using Csr = typename TestFixture::Csr; + auto fact = factorization_type::build() + .with_approximate_select(false) + .with_fill_in_limit(0.6) + .on(this->exec) + ->generate(this->mtx_system); + + GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->mtx_l_small_expect, + this->tol); + GKO_ASSERT_MTX_NEAR(fact->get_lt_factor(), + gko::as(this->mtx_l_small_expect->transpose()), + this->tol); +} + + +TYPED_TEST(ParIct, GenerateWithApproxSmallLimit) +{ + using factorization_type = typename TestFixture::factorization_type; + using Csr = typename TestFixture::Csr; + auto fact = factorization_type::build() + .with_approximate_select(true) + .with_fill_in_limit(0.6) + .on(this->exec) + ->generate(this->mtx_system); + + GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->mtx_l_small_expect, + this->tol); + GKO_ASSERT_MTX_NEAR(fact->get_lt_factor(), + gko::as(this->mtx_l_small_expect->transpose()), + this->tol); +} + + +TYPED_TEST(ParIct, GenerateWithExactLargeLimit) +{ + using factorization_type = typename TestFixture::factorization_type; + using Csr = typename TestFixture::Csr; + auto fact = factorization_type::build() + .with_approximate_select(false) + .with_fill_in_limit(1.2) + .on(this->exec) + ->generate(this->mtx_system); + + GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->mtx_l_large_expect, + this->tol); + GKO_ASSERT_MTX_NEAR(fact->get_lt_factor(), + gko::as(this->mtx_l_large_expect->transpose()), + this->tol); +} + + +TYPED_TEST(ParIct, GenerateWithApproxLargeLimit) +{ + using factorization_type = typename TestFixture::factorization_type; + using Csr = typename TestFixture::Csr; + auto fact = factorization_type::build() + .with_approximate_select(true) + .with_fill_in_limit(1.2) + .on(this->exec) + ->generate(this->mtx_system); + + 
GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->mtx_l_large_expect, + this->tol); + GKO_ASSERT_MTX_NEAR(fact->get_lt_factor(), + gko::as(this->mtx_l_large_expect->transpose()), + this->tol); +} + + +} // namespace diff --git a/reference/test/factorization/par_ilu_kernels.cpp b/reference/test/factorization/par_ilu_kernels.cpp index f64895a1613..b24309de53b 100644 --- a/reference/test/factorization/par_ilu_kernels.cpp +++ b/reference/test/factorization/par_ilu_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include @@ -47,8 +48,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/factorization/factorization_kernels.hpp" #include "core/factorization/par_ilu_kernels.hpp" -#include "core/test/utils/assertions.hpp" +#include "core/test/utils.hpp" namespace { @@ -71,17 +73,25 @@ class DummyLinOp : public gko::EnableLinOp, }; +template class ParIlu : public ::testing::Test { protected: - using value_type = gko::default_precision; - using index_type = gko::int32; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; using Dense = gko::matrix::Dense; using Coo = gko::matrix::Coo; using Csr = gko::matrix::Csr; + using par_ilu_type = gko::factorization::ParIlu; ParIlu() : ref(gko::ReferenceExecutor::create()), exec(std::static_pointer_cast(ref)), // clang-format off + empty_csr(gko::initialize( + {{0., 0., 0.}, + {0., 0., 0.}, + {0., 0., 0.}}, exec)), identity(gko::initialize( {{1., 0., 0.}, {0., 1., 0.}, @@ -107,6 +117,19 @@ class ParIlu : public ::testing::Test { {{4., 6., 8.}, 
{0., -1., 1.}, {0., 0., -1.5}}, exec)), + mtx_small2(gko::initialize( + {{8., 8., 0}, + {2., 0., 5.}, + {1., 1., 1}}, exec)), + mtx_csr_small2(nullptr), + small2_l_expected(gko::initialize( + {{1., 0., 0}, + {.25, 1., 0.}, + {.125, 0., 1}}, exec)), + small2_u_expected(gko::initialize( + {{8., 8., 0}, + {0., -2., 5.}, + {0., 0., 1}}, exec)), mtx_big(gko::initialize({{1., 1., 1., 0., 1., 3.}, {1., 2., 2., 0., 2., 0.}, {0., 2., 3., 3., 3., 5.}, @@ -128,21 +151,46 @@ class ParIlu : public ::testing::Test { {0., 0., 0., 0., 5., -15.}, {0., 0., 0., 0., 0., 6.}}, exec)), + mtx_big_nodiag(gko::initialize({{1., 1., 1., 0., 1., 3.}, + {1., 2., 2., 0., 2., 0.}, + {0., 2., 0., 3., 3., 5.}, + {1., 0., 3., 4., 4., 4.}, + {1., 2., 0., 4., 1., 6.}, + {0., 2., 3., 4., 5., 8.}}, + exec)), + big_nodiag_l_expected(gko::initialize( + {{1., 0., 0., 0., 0., 0.}, + {1., 1., 0., 0., 0., 0.}, + {0., 2., 1., 0., 0., 0.}, + {1., 0., -1., 1., 0., 0.}, + {1., 1., 0., 0.571428571428571, 1., 0.}, + {0., 2., -0.5, 0.785714285714286, -0.108695652173913, 1.}}, + exec)), + big_nodiag_u_expected(gko::initialize( + {{1., 1., 1., 0., 1., 3.}, + {0., 1., 1., 0., 1., 0.}, + {0., 0., -2., 3., 1., 5.}, + {0., 0., 0., 7., 4., 6.}, + {0., 0., 0., 0., -3.28571428571429, -0.428571428571429}, + {0., 0., 0., 0., 0., 5.73913043478261}}, + exec)), // clang-format on ilu_factory_skip( - gko::factorization::ParIlu<>::build().with_skip_sorting(true).on( - exec)), + par_ilu_type::build().with_skip_sorting(true).on(exec)), ilu_factory_sort( - gko::factorization::ParIlu<>::build().with_skip_sorting(false).on( - exec)) + par_ilu_type::build().with_skip_sorting(false).on(exec)) { auto tmp_csr = Csr::create(exec); mtx_small->convert_to(gko::lend(tmp_csr)); mtx_csr_small = std::move(tmp_csr); + auto tmp_csr2 = Csr::create(exec); + mtx_small2->convert_to(gko::lend(tmp_csr2)); + mtx_csr_small2 = std::move(tmp_csr2); } std::shared_ptr ref; std::shared_ptr exec; + std::shared_ptr empty_csr; std::shared_ptr identity; 
std::shared_ptr lower_triangular; std::shared_ptr upper_triangular; @@ -150,28 +198,127 @@ class ParIlu : public ::testing::Test { std::shared_ptr mtx_csr_small; std::shared_ptr small_l_expected; std::shared_ptr small_u_expected; + std::shared_ptr mtx_small2; + std::shared_ptr mtx_csr_small2; + std::shared_ptr small2_l_expected; + std::shared_ptr small2_u_expected; std::shared_ptr mtx_big; std::shared_ptr big_l_expected; std::shared_ptr big_u_expected; - std::unique_ptr::Factory> ilu_factory_skip; - std::unique_ptr::Factory> ilu_factory_sort; + std::shared_ptr mtx_big_nodiag; + std::shared_ptr big_nodiag_l_expected; + std::shared_ptr big_nodiag_u_expected; + std::unique_ptr ilu_factory_skip; + std::unique_ptr ilu_factory_sort; }; +TYPED_TEST_CASE(ParIlu, gko::test::ValueIndexTypes); + + +TYPED_TEST(ParIlu, KernelAddDiagonalElementsEmpty) +{ + using index_type = typename TestFixture::index_type; + using value_type = typename TestFixture::value_type; + using Csr = typename TestFixture::Csr; + auto expected_mtx = + Csr::create(this->ref, this->empty_csr->get_size(), + std::initializer_list{0., 0., 0.}, + std::initializer_list{0, 1, 2}, + std::initializer_list{0, 1, 2, 3}); + auto empty_mtx = this->empty_csr->clone(); + + gko::kernels::reference::factorization::add_diagonal_elements( + this->ref, gko::lend(empty_mtx), true); + + GKO_ASSERT_MTX_NEAR(empty_mtx, expected_mtx, 0.); + GKO_ASSERT_MTX_EQ_SPARSITY(empty_mtx, expected_mtx); +} + + +TYPED_TEST(ParIlu, KernelAddDiagonalElementsNonSquare) +{ + using Csr = typename TestFixture::Csr; + auto matrix = gko::initialize( + {{0., 0., 0.}, {1., 0., 0.}, {1., 1., 1.}, {1., 1., 1.}}, this->ref); + auto exp_values = {0., 1., 0., 1., 1., 1., 1., 1., 1.}; + auto exp_col_idxs = {0, 0, 1, 0, 1, 2, 0, 1, 2}; + auto exp_row_ptrs = {0, 1, 3, 6, 9}; + auto expected_mtx = + Csr::create(this->ref, matrix->get_size(), std::move(exp_values), + std::move(exp_col_idxs), std::move(exp_row_ptrs)); + + 
gko::kernels::reference::factorization::add_diagonal_elements( + this->ref, gko::lend(matrix), true); + + GKO_ASSERT_MTX_NEAR(matrix, expected_mtx, 0.); + GKO_ASSERT_MTX_EQ_SPARSITY(matrix, expected_mtx); +} + + +TYPED_TEST(ParIlu, KernelAddDiagonalElementsNonSquare2) +{ + using Csr = typename TestFixture::Csr; + auto matrix = gko::initialize({{1., 0., 0.}, {1., 0., 0.}}, this->ref); + auto exp_values = {1., 1., 0.}; + auto exp_col_idxs = {0, 0, 1}; + auto exp_row_ptrs = {0, 1, 3}; + auto expected_mtx = + Csr::create(this->ref, matrix->get_size(), std::move(exp_values), + std::move(exp_col_idxs), std::move(exp_row_ptrs)); + + gko::kernels::reference::factorization::add_diagonal_elements( + this->ref, gko::lend(matrix), true); + + GKO_ASSERT_MTX_NEAR(matrix, expected_mtx, 0.); + GKO_ASSERT_MTX_EQ_SPARSITY(matrix, expected_mtx); +} + -TEST_F(ParIlu, KernelInitializeRowPtrsLU) +TYPED_TEST(ParIlu, KernelAddDiagonalElementsUnsorted) { - auto small_csr_l_expected = Csr::create(ref); - small_l_expected->convert_to(gko::lend(small_csr_l_expected)); - auto small_csr_u_expected = Csr::create(ref); - small_u_expected->convert_to(gko::lend(small_csr_u_expected)); - auto num_row_ptrs = mtx_csr_small->get_size()[0] + 1; + using Csr = typename TestFixture::Csr; + auto size = gko::dim<2>{3, 3}; + /* matrix: + 1 2 3 + 1 0 3 + 1 2 0 + */ + auto mtx_values = {3., 2., 1., 3., 1., 2., 1.}; + auto mtx_col_idxs = {2, 1, 0, 2, 0, 1, 0}; + auto mtx_row_ptrs = {0, 3, 5, 7}; + auto matrix = Csr::create(this->ref, size, std::move(mtx_values), + std::move(mtx_col_idxs), std::move(mtx_row_ptrs)); + auto exp_values = {1., 2., 3., 1., 0., 3., 1., 2., 0.}; + auto exp_col_idxs = {0, 1, 2, 0, 1, 2, 0, 1, 2}; + auto exp_row_ptrs = {0, 3, 6, 9}; + auto expected_mtx = + Csr::create(this->ref, size, std::move(exp_values), + std::move(exp_col_idxs), std::move(exp_row_ptrs)); + + gko::kernels::reference::factorization::add_diagonal_elements( + this->ref, gko::lend(matrix), false); + + 
GKO_ASSERT_MTX_NEAR(matrix, expected_mtx, 0.); + GKO_ASSERT_MTX_EQ_SPARSITY(matrix, expected_mtx); +} + + +TYPED_TEST(ParIlu, KernelInitializeRowPtrsLU) +{ + using Csr = typename TestFixture::Csr; + using index_type = typename TestFixture::index_type; + auto small_csr_l_expected = Csr::create(this->ref); + this->small_l_expected->convert_to(gko::lend(small_csr_l_expected)); + auto small_csr_u_expected = Csr::create(this->ref); + this->small_u_expected->convert_to(gko::lend(small_csr_u_expected)); + auto num_row_ptrs = this->mtx_csr_small->get_size()[0] + 1; std::vector l_row_ptrs_vector(num_row_ptrs); std::vector u_row_ptrs_vector(num_row_ptrs); auto l_row_ptrs = l_row_ptrs_vector.data(); auto u_row_ptrs = u_row_ptrs_vector.data(); - gko::kernels::reference::par_ilu_factorization::initialize_row_ptrs_l_u( - ref, gko::lend(mtx_csr_small), l_row_ptrs, u_row_ptrs); + gko::kernels::reference::factorization::initialize_row_ptrs_l_u( + this->ref, gko::lend(this->mtx_csr_small), l_row_ptrs, u_row_ptrs); ASSERT_TRUE(std::equal(l_row_ptrs, l_row_ptrs + num_row_ptrs, small_csr_l_expected->get_const_row_ptrs())); @@ -180,20 +327,51 @@ TEST_F(ParIlu, KernelInitializeRowPtrsLU) } -TEST_F(ParIlu, KernelInitializeLU) +TYPED_TEST(ParIlu, KernelInitializeRowPtrsLUZeroMatrix) { + using index_type = typename TestFixture::index_type; + using Csr = typename TestFixture::Csr; + auto empty_mtx = this->empty_csr->clone(); + gko::kernels::reference::factorization::add_diagonal_elements( + this->ref, gko::lend(empty_mtx), true); + auto empty_mtx_l_expected = Csr::create(this->ref); + this->identity->convert_to(gko::lend(empty_mtx_l_expected)); + auto empty_mtx_u_expected = Csr::create(this->ref); + this->identity->convert_to(gko::lend(empty_mtx_u_expected)); + auto num_row_ptrs = empty_mtx->get_size()[0] + 1; + std::vector l_row_ptrs_vector(num_row_ptrs); + std::vector u_row_ptrs_vector(num_row_ptrs); + auto l_row_ptrs = l_row_ptrs_vector.data(); + auto u_row_ptrs = 
u_row_ptrs_vector.data(); + + gko::kernels::reference::factorization::initialize_row_ptrs_l_u( + this->ref, gko::lend(empty_mtx), l_row_ptrs, u_row_ptrs); + + ASSERT_TRUE(std::equal(l_row_ptrs, l_row_ptrs + num_row_ptrs, + empty_mtx_l_expected->get_const_row_ptrs())); + ASSERT_TRUE(std::equal(u_row_ptrs, u_row_ptrs + num_row_ptrs, + empty_mtx_u_expected->get_const_row_ptrs())); +} + + +TYPED_TEST(ParIlu, KernelInitializeLU) +{ + using Dense = typename TestFixture::Dense; + using Csr = typename TestFixture::Csr; + using index_type = typename TestFixture::index_type; + using value_type = typename TestFixture::value_type; // clang-format off auto expected_l = gko::initialize({{1., 0., 0.}, {2., 1., 0.}, - {1., 1., 1.}}, ref); + {1., 1., 1.}}, this->ref); auto expected_u = gko::initialize({{4., 6., 8.}, {0., 2., 5.}, - {0., 0., 1.}}, ref); + {0., 0., 1.}}, this->ref); // clang-format on - auto actual_l = Csr::create(ref, mtx_csr_small->get_size(), 6); - auto actual_u = Csr::create(ref, mtx_csr_small->get_size(), 6); + auto actual_l = Csr::create(this->ref, this->mtx_csr_small->get_size(), 6); + auto actual_u = Csr::create(this->ref, this->mtx_csr_small->get_size(), 6); // Copy row_ptrs into matrices, which usually come from the // `initialize_row_ptrs_l_u` kernel std::vector l_row_ptrs{0, 1, 3, 6}; @@ -201,258 +379,359 @@ TEST_F(ParIlu, KernelInitializeLU) std::copy(l_row_ptrs.begin(), l_row_ptrs.end(), actual_l->get_row_ptrs()); std::copy(u_row_ptrs.begin(), u_row_ptrs.end(), actual_u->get_row_ptrs()); - gko::kernels::reference::par_ilu_factorization::initialize_l_u( - ref, gko::lend(mtx_csr_small), gko::lend(actual_l), + gko::kernels::reference::factorization::initialize_l_u( + this->ref, gko::lend(this->mtx_csr_small), gko::lend(actual_l), + gko::lend(actual_u)); + + GKO_ASSERT_MTX_NEAR(actual_l, expected_l, r::value); + GKO_ASSERT_MTX_NEAR(actual_u, expected_u, r::value); +} + + +TYPED_TEST(ParIlu, KernelInitializeLUZeroMatrix) +{ + using value_type = typename 
TestFixture::value_type; + using Csr = typename TestFixture::Csr; + auto actual_l = Csr::create(this->ref); + auto actual_u = Csr::create(this->ref); + actual_l->copy_from(gko::lend(this->identity)); + actual_u->copy_from(gko::lend(this->identity)); + + gko::kernels::reference::factorization::initialize_l_u( + this->ref, gko::lend(this->empty_csr), gko::lend(actual_l), gko::lend(actual_u)); - GKO_ASSERT_MTX_NEAR(actual_l, expected_l, 1e-14); - GKO_ASSERT_MTX_NEAR(actual_u, expected_u, 1e-14); + GKO_ASSERT_MTX_NEAR(actual_l, this->identity, r::value); + GKO_ASSERT_MTX_NEAR(actual_u, this->identity, r::value); } -TEST_F(ParIlu, KernelComputeLU) +TYPED_TEST(ParIlu, KernelComputeLU) { + using value_type = typename TestFixture::value_type; + using Dense = typename TestFixture::Dense; + using Coo = typename TestFixture::Coo; + using Csr = typename TestFixture::Csr; // clang-format off auto l_dense = gko::initialize({{1., 0., 0.}, {2., 1., 0.}, - {1., 1., 1.}}, ref); + {1., 1., 1.}}, this->ref); // U must be transposed before calling the kernel, so we simply create it // transposed auto u_dense = gko::initialize({{4., 0., 0.}, {6., 2., 0.}, - {8., 5., 1.}}, ref); + {8., 5., 1.}}, this->ref); // clang-format on - auto l_csr = Csr::create(ref); - auto u_csr = Csr::create(ref); - auto mtx_coo = Coo::create(ref); + auto l_csr = Csr::create(this->ref); + auto u_csr = Csr::create(this->ref); + auto mtx_coo = Coo::create(this->ref); constexpr unsigned int iterations = 1; l_dense->convert_to(gko::lend(l_csr)); u_dense->convert_to(gko::lend(u_csr)); - mtx_small->convert_to(gko::lend(mtx_coo)); + this->mtx_small->convert_to(gko::lend(mtx_coo)); // The expected result of U also needs to be transposed - auto u_expected_lin_op = small_u_expected->transpose(); + auto u_expected_lin_op = this->small_u_expected->transpose(); auto u_expected = std::unique_ptr( static_cast(u_expected_lin_op.release())); gko::kernels::reference::par_ilu_factorization::compute_l_u_factors( - ref, iterations, 
gko::lend(mtx_coo), gko::lend(l_csr), + this->ref, iterations, gko::lend(mtx_coo), gko::lend(l_csr), gko::lend(u_csr)); - GKO_ASSERT_MTX_NEAR(l_csr, small_l_expected, 1e-14); - GKO_ASSERT_MTX_NEAR(u_csr, u_expected, 1e-14); + GKO_ASSERT_MTX_NEAR(l_csr, this->small_l_expected, r::value); + GKO_ASSERT_MTX_NEAR(u_csr, u_expected, r::value); } -TEST_F(ParIlu, ThrowNotSupportedForWrongLinOp1) +TYPED_TEST(ParIlu, ThrowNotSupportedForWrongLinOp1) { - auto linOp = DummyLinOp::create(ref); + auto linOp = DummyLinOp::create(this->ref); - ASSERT_THROW(ilu_factory_skip->generate(gko::share(linOp)), + ASSERT_THROW(this->ilu_factory_skip->generate(gko::share(linOp)), gko::NotSupported); } -TEST_F(ParIlu, ThrowNotSupportedForWrongLinOp2) +TYPED_TEST(ParIlu, ThrowNotSupportedForWrongLinOp2) { - auto linOp = DummyLinOp::create(ref); + auto linOp = DummyLinOp::create(this->ref); - ASSERT_THROW(ilu_factory_sort->generate(gko::share(linOp)), + ASSERT_THROW(this->ilu_factory_sort->generate(gko::share(linOp)), gko::NotSupported); } -TEST_F(ParIlu, ThrowDimensionMismatch) +TYPED_TEST(ParIlu, ThrowDimensionMismatch) { - auto matrix = Csr::create(ref, gko::dim<2>{2, 3}, 4); + using Csr = typename TestFixture::Csr; + auto matrix = Csr::create(this->ref, gko::dim<2>{2, 3}, 4); - ASSERT_THROW(ilu_factory_sort->generate(gko::share(matrix)), + ASSERT_THROW(this->ilu_factory_sort->generate(gko::share(matrix)), gko::DimensionMismatch); } -TEST_F(ParIlu, LUFactorFunctionsSetProperly) +TYPED_TEST(ParIlu, SetLStrategy) +{ + using Csr = typename TestFixture::Csr; + using par_ilu_type = typename TestFixture::par_ilu_type; + auto l_strategy = std::make_shared(); + + auto factory = + par_ilu_type::build().with_l_strategy(l_strategy).on(this->ref); + auto par_ilu = factory->generate(this->mtx_small); + + ASSERT_EQ(factory->get_parameters().l_strategy, l_strategy); + ASSERT_EQ(par_ilu->get_l_factor()->get_strategy()->get_name(), + l_strategy->get_name()); +} + + +TYPED_TEST(ParIlu, SetUStrategy) { - auto 
factors = ilu_factory_skip->generate(mtx_small); + using Csr = typename TestFixture::Csr; + using par_ilu_type = typename TestFixture::par_ilu_type; + auto u_strategy = std::make_shared(); + + auto factory = + par_ilu_type::build().with_u_strategy(u_strategy).on(this->ref); + auto par_ilu = factory->generate(this->mtx_small); + + ASSERT_EQ(factory->get_parameters().u_strategy, u_strategy); + ASSERT_EQ(par_ilu->get_u_factor()->get_strategy()->get_name(), + u_strategy->get_name()); +} + + +TYPED_TEST(ParIlu, LUFactorFunctionsSetProperly) +{ + auto factors = this->ilu_factory_skip->generate(this->mtx_small); auto lin_op_l_factor = - static_cast(factors->get_l_factor().get()); + static_cast(gko::lend(factors->get_l_factor())); auto lin_op_u_factor = - static_cast(factors->get_u_factor().get()); - auto first_operator = factors->get_operators()[0].get(); - auto second_operator = factors->get_operators()[1].get(); + static_cast(gko::lend(factors->get_u_factor())); + auto first_operator = gko::lend(factors->get_operators()[0]); + auto second_operator = gko::lend(factors->get_operators()[1]); ASSERT_EQ(lin_op_l_factor, first_operator); ASSERT_EQ(lin_op_u_factor, second_operator); } -TEST_F(ParIlu, GenerateForCooIdentity) +TYPED_TEST(ParIlu, GenerateForCooIdentity) { - auto coo_mtx = gko::share(Coo::create(exec)); - identity->convert_to(coo_mtx.get()); + using Coo = typename TestFixture::Coo; + using value_type = typename TestFixture::value_type; + auto coo_mtx = gko::share(Coo::create(this->exec)); + this->identity->convert_to(gko::lend(coo_mtx)); - auto factors = ilu_factory_skip->generate(coo_mtx); + auto factors = this->ilu_factory_skip->generate(coo_mtx); auto l_factor = factors->get_l_factor(); auto u_factor = factors->get_u_factor(); - GKO_ASSERT_MTX_NEAR(l_factor, identity, 1e-14); - GKO_ASSERT_MTX_NEAR(u_factor, identity, 1e-14); + GKO_ASSERT_MTX_NEAR(l_factor, this->identity, r::value); + GKO_ASSERT_MTX_NEAR(u_factor, this->identity, r::value); } -TEST_F(ParIlu, 
GenerateForCsrIdentity) +TYPED_TEST(ParIlu, GenerateForCsrIdentity) { - auto csr_mtx = gko::share(Csr::create(exec)); - identity->convert_to(csr_mtx.get()); + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + auto csr_mtx = gko::share(Csr::create(this->exec)); + this->identity->convert_to(gko::lend(csr_mtx)); - auto factors = ilu_factory_skip->generate(csr_mtx); + auto factors = this->ilu_factory_skip->generate(csr_mtx); auto l_factor = factors->get_l_factor(); auto u_factor = factors->get_u_factor(); - GKO_ASSERT_MTX_NEAR(l_factor, identity, 1e-14); - GKO_ASSERT_MTX_NEAR(u_factor, identity, 1e-14); + GKO_ASSERT_MTX_NEAR(l_factor, this->identity, r::value); + GKO_ASSERT_MTX_NEAR(u_factor, this->identity, r::value); } -TEST_F(ParIlu, GenerateForDenseIdentity) +TYPED_TEST(ParIlu, GenerateForDenseIdentity) { - auto factors = ilu_factory_skip->generate(identity); + using value_type = typename TestFixture::value_type; + auto factors = this->ilu_factory_skip->generate(this->identity); auto l_factor = factors->get_l_factor(); auto u_factor = factors->get_u_factor(); - GKO_ASSERT_MTX_NEAR(l_factor, identity, 1e-14); - GKO_ASSERT_MTX_NEAR(u_factor, identity, 1e-14); + GKO_ASSERT_MTX_NEAR(l_factor, this->identity, r::value); + GKO_ASSERT_MTX_NEAR(u_factor, this->identity, r::value); } -TEST_F(ParIlu, GenerateForDenseLowerTriangular) +TYPED_TEST(ParIlu, GenerateForDenseLowerTriangular) { - auto factors = ilu_factory_skip->generate(lower_triangular); + using value_type = typename TestFixture::value_type; + auto factors = this->ilu_factory_skip->generate(this->lower_triangular); auto l_factor = factors->get_l_factor(); auto u_factor = factors->get_u_factor(); - GKO_ASSERT_MTX_NEAR(l_factor, lower_triangular, 1e-14); - GKO_ASSERT_MTX_NEAR(u_factor, identity, 1e-14); + GKO_ASSERT_MTX_NEAR(l_factor, this->lower_triangular, r::value); + GKO_ASSERT_MTX_NEAR(u_factor, this->identity, r::value); } -TEST_F(ParIlu, 
GenerateForDenseUpperTriangular) +TYPED_TEST(ParIlu, GenerateForDenseUpperTriangular) { - auto factors = ilu_factory_skip->generate(upper_triangular); + using value_type = typename TestFixture::value_type; + auto factors = this->ilu_factory_skip->generate(this->upper_triangular); auto l_factor = factors->get_l_factor(); auto u_factor = factors->get_u_factor(); - GKO_ASSERT_MTX_NEAR(l_factor, identity, 1e-14); - GKO_ASSERT_MTX_NEAR(u_factor, upper_triangular, 1e-14); + GKO_ASSERT_MTX_NEAR(l_factor, this->identity, r::value); + GKO_ASSERT_MTX_NEAR(u_factor, this->upper_triangular, r::value); } -TEST_F(ParIlu, ApplyMethodDenseSmall) +TYPED_TEST(ParIlu, ApplyMethodDenseSmall) { - const auto x = gko::initialize({1., 2., 3.}, exec); + using value_type = typename TestFixture::value_type; + using Dense = typename TestFixture::Dense; + const auto x = gko::initialize({1., 2., 3.}, this->exec); auto b_lu = Dense::create_with_config_of(gko::lend(x)); auto b_ref = Dense::create_with_config_of(gko::lend(x)); - auto factors = ilu_factory_skip->generate(mtx_small); + auto factors = this->ilu_factory_skip->generate(this->mtx_small); factors->apply(gko::lend(x), gko::lend(b_lu)); - mtx_small->apply(gko::lend(x), gko::lend(b_ref)); + this->mtx_small->apply(gko::lend(x), gko::lend(b_ref)); + + GKO_ASSERT_MTX_NEAR(b_lu, b_ref, r::value); +} + + +TYPED_TEST(ParIlu, GenerateForDenseSmall) +{ + using value_type = typename TestFixture::value_type; + auto factors = this->ilu_factory_skip->generate(this->mtx_small); + auto l_factor = factors->get_l_factor(); + auto u_factor = factors->get_u_factor(); + + GKO_ASSERT_MTX_NEAR(l_factor, this->small_l_expected, r::value); + GKO_ASSERT_MTX_NEAR(u_factor, this->small_u_expected, r::value); +} + + +TYPED_TEST(ParIlu, GenerateForCsrSmall) +{ + using value_type = typename TestFixture::value_type; + auto factors = this->ilu_factory_skip->generate(this->mtx_csr_small); + auto l_factor = factors->get_l_factor(); + auto u_factor = 
factors->get_u_factor(); - GKO_ASSERT_MTX_NEAR(b_lu, b_ref, 1e-14); + GKO_ASSERT_MTX_NEAR(l_factor, this->small_l_expected, r::value); + GKO_ASSERT_MTX_NEAR(u_factor, this->small_u_expected, r::value); } -TEST_F(ParIlu, GenerateForDenseSmall) +TYPED_TEST(ParIlu, GenerateForCsrSmall2ZeroDiagonal) { - auto factors = ilu_factory_skip->generate(mtx_small); + using value_type = typename TestFixture::value_type; + auto factors = this->ilu_factory_skip->generate(this->mtx_csr_small2); auto l_factor = factors->get_l_factor(); auto u_factor = factors->get_u_factor(); - GKO_ASSERT_MTX_NEAR(l_factor, small_l_expected, 1e-14); - GKO_ASSERT_MTX_NEAR(u_factor, small_u_expected, 1e-14); + GKO_ASSERT_MTX_NEAR(l_factor, this->small2_l_expected, + r::value); + GKO_ASSERT_MTX_NEAR(u_factor, this->small2_u_expected, + r::value); } -TEST_F(ParIlu, GenerateForCsrSmall) +TYPED_TEST(ParIlu, GenerateForCsrBigWithDiagonalZeros) { - auto factors = ilu_factory_skip->generate(mtx_csr_small); + using value_type = typename TestFixture::value_type; + auto factors = this->ilu_factory_skip->generate(this->mtx_big_nodiag); auto l_factor = factors->get_l_factor(); auto u_factor = factors->get_u_factor(); - GKO_ASSERT_MTX_NEAR(l_factor, small_l_expected, 1e-14); - GKO_ASSERT_MTX_NEAR(u_factor, small_u_expected, 1e-14); + GKO_ASSERT_MTX_NEAR(l_factor, this->big_nodiag_l_expected, + r::value); + GKO_ASSERT_MTX_NEAR(u_factor, this->big_nodiag_u_expected, + r::value); } -TEST_F(ParIlu, GenerateForDenseSmallWithMultipleIterations) +TYPED_TEST(ParIlu, GenerateForDenseSmallWithMultipleIterations) { - auto multiple_iter_factory = gko::factorization::ParIlu<>::build() - .with_iterations(5u) - .with_skip_sorting(true) - .on(exec); - auto factors = multiple_iter_factory->generate(mtx_small); + using value_type = typename TestFixture::value_type; + using par_ilu_type = typename TestFixture::par_ilu_type; + auto multiple_iter_factory = + par_ilu_type::build().with_iterations(5u).with_skip_sorting(true).on( + 
this->exec); + auto factors = multiple_iter_factory->generate(this->mtx_small); auto l_factor = factors->get_l_factor(); auto u_factor = factors->get_u_factor(); - GKO_ASSERT_MTX_NEAR(l_factor, small_l_expected, 1e-14); - GKO_ASSERT_MTX_NEAR(u_factor, small_u_expected, 1e-14); + GKO_ASSERT_MTX_NEAR(l_factor, this->small_l_expected, r::value); + GKO_ASSERT_MTX_NEAR(u_factor, this->small_u_expected, r::value); } -TEST_F(ParIlu, GenerateForDenseBig) +TYPED_TEST(ParIlu, GenerateForDenseBig) { - auto factors = ilu_factory_skip->generate(mtx_big); + using value_type = typename TestFixture::value_type; + auto factors = this->ilu_factory_skip->generate(this->mtx_big); auto l_factor = factors->get_l_factor(); auto u_factor = factors->get_u_factor(); - GKO_ASSERT_MTX_NEAR(l_factor, big_l_expected, 1e-14); - GKO_ASSERT_MTX_NEAR(u_factor, big_u_expected, 1e-14); + GKO_ASSERT_MTX_NEAR(l_factor, this->big_l_expected, r::value); + GKO_ASSERT_MTX_NEAR(u_factor, this->big_u_expected, r::value); } -TEST_F(ParIlu, GenerateForDenseBigSort) +TYPED_TEST(ParIlu, GenerateForDenseBigSort) { - auto factors = ilu_factory_skip->generate(mtx_big); + using value_type = typename TestFixture::value_type; + auto factors = this->ilu_factory_skip->generate(this->mtx_big); auto l_factor = factors->get_l_factor(); auto u_factor = factors->get_u_factor(); - GKO_ASSERT_MTX_NEAR(l_factor, big_l_expected, 1e-14); - GKO_ASSERT_MTX_NEAR(u_factor, big_u_expected, 1e-14); + GKO_ASSERT_MTX_NEAR(l_factor, this->big_l_expected, r::value); + GKO_ASSERT_MTX_NEAR(u_factor, this->big_u_expected, r::value); } -TEST_F(ParIlu, GenerateForReverseCooSmall) +TYPED_TEST(ParIlu, GenerateForReverseCooSmall) { - const auto size = mtx_small->get_size(); + using value_type = typename TestFixture::value_type; + using Coo = typename TestFixture::Coo; + const auto size = this->mtx_small->get_size(); const auto nnz = size[0] * size[1]; - auto reverse_coo = gko::share(Coo::create(exec, size, nnz)); + auto reverse_coo = 
gko::share(Coo::create(this->exec, size, nnz)); // Fill the Coo matrix in reversed row order (right to left) for (size_t i = 0; i < size[0]; ++i) { for (size_t j = 0; j < size[1]; ++j) { const auto coo_idx = i * size[1] + (size[1] - 1 - j); reverse_coo->get_row_idxs()[coo_idx] = i; reverse_coo->get_col_idxs()[coo_idx] = j; - reverse_coo->get_values()[coo_idx] = mtx_small->at(i, j); + reverse_coo->get_values()[coo_idx] = this->mtx_small->at(i, j); } } - auto factors = ilu_factory_sort->generate(reverse_coo); + auto factors = this->ilu_factory_sort->generate(reverse_coo); auto l_factor = factors->get_l_factor(); auto u_factor = factors->get_u_factor(); - GKO_ASSERT_MTX_NEAR(reverse_coo, mtx_small, 1e-14); - GKO_ASSERT_MTX_NEAR(l_factor, small_l_expected, 1e-14); - GKO_ASSERT_MTX_NEAR(u_factor, small_u_expected, 1e-14); + GKO_ASSERT_MTX_NEAR(reverse_coo, this->mtx_small, r::value); + GKO_ASSERT_MTX_NEAR(l_factor, this->small_l_expected, r::value); + GKO_ASSERT_MTX_NEAR(u_factor, this->small_u_expected, r::value); } -TEST_F(ParIlu, GenerateForReverseCsrSmall) +TYPED_TEST(ParIlu, GenerateForReverseCsrSmall) { - const auto size = mtx_csr_small->get_size(); + using value_type = typename TestFixture::value_type; + using Csr = typename TestFixture::Csr; + const auto size = this->mtx_csr_small->get_size(); const auto nnz = size[0] * size[1]; - auto reverse_csr = gko::share(Csr::create(exec)); - reverse_csr->copy_from(mtx_csr_small.get()); + auto reverse_csr = gko::share(Csr::create(this->exec)); + reverse_csr->copy_from(gko::lend(this->mtx_csr_small)); // Fill the Csr matrix rows in reverse order for (size_t i = 0; i < size[0]; ++i) { const auto row_start = reverse_csr->get_row_ptrs()[i]; @@ -460,18 +739,18 @@ TEST_F(ParIlu, GenerateForReverseCsrSmall) for (size_t j = row_start; j < row_end; ++j) { const auto reverse_j = row_end - 1 - (j - row_start); reverse_csr->get_values()[reverse_j] = - mtx_csr_small->get_const_values()[j]; + this->mtx_csr_small->get_const_values()[j]; 
reverse_csr->get_col_idxs()[reverse_j] = - mtx_csr_small->get_const_col_idxs()[j]; + this->mtx_csr_small->get_const_col_idxs()[j]; } } - auto factors = ilu_factory_sort->generate(reverse_csr); + auto factors = this->ilu_factory_sort->generate(reverse_csr); auto l_factor = factors->get_l_factor(); auto u_factor = factors->get_u_factor(); - GKO_ASSERT_MTX_NEAR(l_factor, small_l_expected, 1e-14); - GKO_ASSERT_MTX_NEAR(u_factor, small_u_expected, 1e-14); + GKO_ASSERT_MTX_NEAR(l_factor, this->small_l_expected, r::value); + GKO_ASSERT_MTX_NEAR(u_factor, this->small_u_expected, r::value); } diff --git a/reference/test/factorization/par_ilut_kernels.cpp b/reference/test/factorization/par_ilut_kernels.cpp new file mode 100644 index 00000000000..a72dd6206f7 --- /dev/null +++ b/reference/test/factorization/par_ilut_kernels.cpp @@ -0,0 +1,675 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include +#include +#include + + +#include + + +#include +#include +#include +#include + + +#include "core/factorization/par_ilut_kernels.hpp" +#include "core/test/utils.hpp" + + +namespace { + + +class DummyLinOp : public gko::EnableLinOp, + public gko::EnableCreateMethod { +public: + DummyLinOp(std::shared_ptr exec, + gko::dim<2> size = gko::dim<2>{}) + : EnableLinOp(exec, size) + {} + +protected: + void apply_impl(const gko::LinOp *b, gko::LinOp *x) const override {} + + void apply_impl(const gko::LinOp *alpha, const gko::LinOp *b, + const gko::LinOp *beta, gko::LinOp *x) const override + {} +}; + + +template +class ParIlut : public ::testing::Test { +protected: + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using factorization_type = + gko::factorization::ParIlut; + using Dense = gko::matrix::Dense; + using Coo = gko::matrix::Coo; + using Csr = gko::matrix::Csr; + using ComplexCsr = + gko::matrix::Csr>, + index_type>; + + ParIlut() + : ref(gko::ReferenceExecutor::create()), + exec(std::static_pointer_cast(ref)), + + mtx1(gko::initialize({{.1, 0., 0., 0.}, + {.1, .1, 0., 0.}, + {-1., -2., -1., 0.}, + {-2., -3., -1., 1.}}, + ref)), + mtx1_expect_thrm2(gko::initialize({{.1, 0., 0., 0.}, + {0., .1, 0., 0.}, + {0., -2., -1., 0.}, + {-2., -3., 
0., 1.}}, + ref)), + mtx1_expect_thrm3(gko::initialize({{.1, 0., 0., 0.}, + {0., .1, 0., 0.}, + {0., 0., -1., 0.}, + {0., -3., 0., 1.}}, + ref)), + mtx1_complex(gko::initialize( + {{{.1, 0.}, {0., 0.}, {0., 0.}, {0., 0.}}, + {{-1., .1}, {.1, -1.}, {0., 0.}, {0., 0.}}, + {{-1., 1.}, {-2., .2}, {-1., -.3}, {0., 0.}}, + {{1., -2.}, {-3., -.1}, {-1., .1}, {.1, 2.}}}, + ref)), + mtx1_expect_complex_thrm(gko::initialize( + {{{.1, 0.}, {0., 0.}, {0., 0.}, {0., 0.}}, + {{0., 0.}, {.1, -1.}, {0., 0.}, {0., 0.}}, + {{-1., 1.}, {-2., .2}, {-1., -.3}, {0., 0.}}, + {{1., -2.}, {-3., -.1}, {0., 0.}, {.1, 2.}}}, + ref)), + identity(gko::initialize( + {{1., 0., 0.}, {0., 1., 0.}, {0., 0., 1.}}, ref)), + lower_tri(gko::initialize( + {{1., 0., 0.}, {1., 1., 0.}, {1., 1., 1.}}, ref)), + upper_tri(gko::initialize( + {{2., 1., 1.}, {0., -3., 1.}, {0., 0., 4.}}, ref)), + mtx_system(gko::initialize({{1., 6., 4., 7.}, + {2., -5., 0., 8.}, + {.5, -3., 6., 0.}, + {.2, -.5, -9., 0.}}, + ref)), + mtx_l_system(gko::initialize({{1., 0., 0., 0.}, + {2., 1., 0., 0.}, + {.5, -3., 1., 0.}, + {.2, -.5, -9., 1.}}, + ref)), + mtx_u_system(gko::initialize({{1., 6., 4., 7.}, + {0., 1., 0., 8.}, + {0., 0., 6., 0.}, + {0., 0., 0., 1.}}, + ref)), + mtx_l(gko::initialize({{1., 0., 0., 0.}, + {4., 1., 0., 0.}, + {-1., 0., 1., 0.}, + {0., -3., -1., 1.}}, + ref)), + mtx_u(gko::initialize({{2., 0., 1., 1.}, + {0., 3., 0., 2.}, + {0., 0., .5, 0.}, + {0., 0., 0., 4.}}, + ref)), + mtx_lu(gko::initialize({{1., 2., 3., 4.}, + {0., 6., 7., 8.}, + {9., .1, .2, 0.}, + {.3, .4, .5, .6}}, + ref)), + mtx_l_add_expect(gko::initialize({{1., 0., 0., 0.}, + {4., 1., 0., 0.}, + {-1., -3.1 / 3., 1., 0.}, + {-.05, -3., -1., 1.}}, + ref)), + mtx_u_add_expect(gko::initialize({{2., 4., 1., 1.}, + {0., 3., -7., 2.}, + {0., 0., .5, 0.}, + {0., 0., 0., 4.}}, + ref)), + mtx_l_it_expect(gko::initialize({{1., 0., 0., 0.}, + {2., 1., 0., 0.}, + {.5, 6. 
/ 17., 1., 0.}, + {.2, .1, -2.45, 1.}}, + ref)), + mtx_u_it_expect(gko::initialize({{1., 0., 0., 0.}, + {6., -17., 0., 0.}, + {4., 0., 4., 0.}, + {7., -6., 0., -.8}}, + ref)), + mtx_l_small_expect(gko::initialize({{1., 0., 0., 0.}, + {2., 1., 0., 0.}, + {.5, 6. / 17., 1., 0.}, + {0., 0., -153. / 116., 1.}}, + ref)), + mtx_u_small_expect(gko::initialize({{1., 6., 4., 7.}, + {0., -17., -8., -6.}, + {0., 0., 116. / 17., 0.}, + {0., 0., 0., .0}}, + ref)), + mtx_l_large_expect( + gko::initialize({{1., 0., 0., 0.}, + {2., 1., 0., 0.}, + {.5, 6. / 17., 1., 0.}, + {0.2, 0.1, -153. / 116., 1.}}, + ref)), + mtx_u_large_expect( + gko::initialize({{1., 6., 4., 7.}, + {0., -17., -8., -6.}, + {0., 0., 116. / 17., -47. / 34.}, + {0., 0., 0., -3043. / 1160.}}, + ref)), + fact_fact(factorization_type::build().on(exec)), + tol{r::value} + {} + + template + void test_select(const std::unique_ptr &mtx, index_type rank, + gko::remove_complex expected, + gko::remove_complex tolerance = 0.0) + { + using ValueType = typename Mtx::value_type; + gko::remove_complex result{}; + + gko::remove_complex res{}; + gko::remove_complex dres{}; + gko::Array tmp(ref); + gko::Array> tmp2(ref); + gko::kernels::reference::par_ilut_factorization::threshold_select( + ref, mtx.get(), rank, tmp, tmp2, result); + + ASSERT_NEAR(result, expected, tolerance); + } + + template > + void test_filter(const std::unique_ptr &mtx, + gko::remove_complex threshold, + const std::unique_ptr &expected, bool lower) + { + auto res_mtx = Mtx::create(exec, mtx->get_size()); + auto res_mtx_coo = Coo::create(exec, mtx->get_size()); + + auto local_mtx = gko::as(lower ? mtx->clone() : mtx->transpose()); + auto local_expected = + gko::as(lower ? 
expected->clone() : expected->transpose()); + + gko::kernels::reference::par_ilut_factorization::threshold_filter( + ref, local_mtx.get(), threshold, res_mtx.get(), res_mtx_coo.get(), + lower); + + GKO_ASSERT_MTX_EQ_SPARSITY(local_expected, res_mtx); + GKO_ASSERT_MTX_NEAR(local_expected, res_mtx, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx, res_mtx_coo); + GKO_ASSERT_MTX_NEAR(res_mtx, res_mtx_coo, 0); + } + + template > + void test_filter_approx(const std::unique_ptr &mtx, index_type rank, + const std::unique_ptr &expected) + { + auto res_mtx = Mtx::create(exec, mtx->get_size()); + auto res_mtx_coo = Coo::create(exec, mtx->get_size()); + auto res_mtx2 = Mtx::create(exec, mtx->get_size()); + auto res_mtx_coo2 = Coo::create(exec, mtx->get_size()); + + auto tmp = gko::Array{exec}; + gko::remove_complex threshold{}; + gko::kernels::reference::par_ilut_factorization:: + threshold_filter_approx(ref, mtx.get(), rank, tmp, threshold, + res_mtx.get(), res_mtx_coo.get()); + gko::kernels::reference::par_ilut_factorization::threshold_filter( + ref, mtx.get(), threshold, res_mtx2.get(), res_mtx_coo2.get(), + true); + + GKO_ASSERT_MTX_EQ_SPARSITY(expected, res_mtx); + GKO_ASSERT_MTX_EQ_SPARSITY(expected, res_mtx2); + GKO_ASSERT_MTX_NEAR(expected, res_mtx, 0); + GKO_ASSERT_MTX_NEAR(expected, res_mtx2, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx, res_mtx_coo); + GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx, res_mtx_coo2); + GKO_ASSERT_MTX_NEAR(res_mtx, res_mtx_coo, 0); + GKO_ASSERT_MTX_NEAR(res_mtx, res_mtx_coo2, 0); + } + + std::shared_ptr ref; + std::shared_ptr exec; + std::unique_ptr mtx1; + std::unique_ptr mtx1_expect_thrm2; + std::unique_ptr mtx1_expect_thrm3; + std::unique_ptr mtx1_complex; + std::unique_ptr mtx1_expect_complex_thrm; + std::shared_ptr identity; + std::shared_ptr lower_tri; + std::shared_ptr upper_tri; + std::shared_ptr mtx_system; + std::unique_ptr mtx_l_system; + std::unique_ptr mtx_u_system; + std::unique_ptr mtx_l; + std::unique_ptr mtx_u; + std::unique_ptr mtx_lu; + 
std::unique_ptr mtx_l_add_expect; + std::unique_ptr mtx_u_add_expect; + std::unique_ptr mtx_l_it_expect; + std::unique_ptr mtx_u_it_expect; + std::unique_ptr mtx_l_small_expect; + std::unique_ptr mtx_u_small_expect; + std::unique_ptr mtx_l_large_expect; + std::unique_ptr mtx_u_large_expect; + std::unique_ptr fact_fact; + gko::remove_complex tol; +}; // namespace + +TYPED_TEST_CASE(ParIlut, gko::test::ValueIndexTypes); + + +TYPED_TEST(ParIlut, KernelThresholdSelect) +{ + this->test_select(this->mtx1, 7, 2.0); +} + + +TYPED_TEST(ParIlut, KernelThresholdSelectMin) +{ + this->test_select(this->mtx1, 0, 0.1); +} + + +TYPED_TEST(ParIlut, KernelThresholdSelectMax) +{ + this->test_select(this->mtx1, 9, 3.0); +} + + +TYPED_TEST(ParIlut, KernelComplexThresholdSelect) +{ + using value_type = typename TestFixture::value_type; + this->test_select(this->mtx1_complex, 5, sqrt(2), this->tol); +} + + +TYPED_TEST(ParIlut, KernelComplexThresholdSelectMin) +{ + using value_type = typename TestFixture::value_type; + this->test_select(this->mtx1_complex, 0, 0.1, this->tol); +} + + +TYPED_TEST(ParIlut, KernelComplexThresholdSelectMax) +{ + using value_type = typename TestFixture::value_type; + this->test_select(this->mtx1_complex, 9, sqrt(9.01), this->tol); +} + + +TYPED_TEST(ParIlut, KernelThresholdFilterNullptrCoo) +{ + using Csr = typename TestFixture::Csr; + using Coo = typename TestFixture::Coo; + auto res_mtx = Csr::create(this->exec, this->mtx1->get_size()); + Coo *null_coo = nullptr; + + gko::kernels::reference::par_ilut_factorization::threshold_filter( + this->ref, this->mtx1.get(), 0.0, res_mtx.get(), null_coo, true); + + GKO_ASSERT_MTX_EQ_SPARSITY(this->mtx1, res_mtx); + GKO_ASSERT_MTX_NEAR(this->mtx1, res_mtx, 0); +} + + +TYPED_TEST(ParIlut, KernelThresholdFilterNoneLower) +{ + this->test_filter(this->mtx1, 0.0, this->mtx1, true); +} + + +TYPED_TEST(ParIlut, KernelThresholdFilterNoneUpper) +{ + this->test_filter(this->mtx1, 0.0, this->mtx1, false); +} + + +TYPED_TEST(ParIlut, 
KernelThresholdFilterSomeAtThresholdLower) +{ + this->test_filter(this->mtx1, 2.0, this->mtx1_expect_thrm2, true); +} + + +TYPED_TEST(ParIlut, KernelThresholdFilterSomeAtThresholdUpper) +{ + this->test_filter(this->mtx1, 2.0, this->mtx1_expect_thrm2, false); +} + + +TYPED_TEST(ParIlut, KernelThresholdFilterSomeAboveThresholdLower) +{ + this->test_filter(this->mtx1, 3.0, this->mtx1_expect_thrm3, true); +} + + +TYPED_TEST(ParIlut, KernelThresholdFilterSomeAboveThresholdUpper) +{ + this->test_filter(this->mtx1, 3.0, this->mtx1_expect_thrm3, false); +} + + +TYPED_TEST(ParIlut, KernelComplexThresholdFilterNoneLower) +{ + this->test_filter(this->mtx1_complex, 0.0, this->mtx1_complex, true); +} + + +TYPED_TEST(ParIlut, KernelComplexThresholdFilterNoneUpper) +{ + this->test_filter(this->mtx1_complex, 0.0, this->mtx1_complex, false); +} + + +TYPED_TEST(ParIlut, KernelComplexThresholdFilterSomeAtThresholdLower) +{ + this->test_filter(this->mtx1_complex, 1.01, this->mtx1_expect_complex_thrm, + true); +} + + +TYPED_TEST(ParIlut, KernelComplexThresholdFilterSomeAtThresholdUpper) +{ + this->test_filter(this->mtx1_complex, 1.01, this->mtx1_expect_complex_thrm, + false); +} + + +TYPED_TEST(ParIlut, KernelThresholdFilterApproxNullptrCoo) +{ + using Csr = typename TestFixture::Csr; + using Coo = typename TestFixture::Coo; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto res_mtx = Csr::create(this->exec, this->mtx1->get_size()); + auto tmp = gko::Array{this->ref}; + gko::remove_complex threshold{}; + Coo *null_coo = nullptr; + index_type rank{}; + + gko::kernels::reference::par_ilut_factorization::threshold_filter_approx( + this->ref, this->mtx1.get(), rank, tmp, threshold, res_mtx.get(), + null_coo); + + GKO_ASSERT_MTX_EQ_SPARSITY(this->mtx1, res_mtx); + GKO_ASSERT_MTX_NEAR(this->mtx1, res_mtx, 0); +} + + +TYPED_TEST(ParIlut, KernelThresholdFilterSomeApprox1) +{ + this->test_filter_approx(this->mtx1, 7, 
this->mtx1_expect_thrm2); +} + + +TYPED_TEST(ParIlut, KernelThresholdFilterSomeApprox2) +{ + this->test_filter_approx(this->mtx1, 8, this->mtx1_expect_thrm2); +} + + +TYPED_TEST(ParIlut, KernelThresholdFilterNoneApprox) +{ + this->test_filter_approx(this->mtx1, 0, this->mtx1); +} + + +TYPED_TEST(ParIlut, KernelComplexThresholdFilterSomeApprox) +{ + this->test_filter_approx(this->mtx1_complex, 4, + this->mtx1_expect_complex_thrm); +} + + +TYPED_TEST(ParIlut, KernelComplexThresholdFilterNoneApprox) +{ + this->test_filter_approx(this->mtx1_complex, 0, this->mtx1_complex); +} + + +TYPED_TEST(ParIlut, KernelAddCandidates) +{ + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + auto res_mtx_l = Csr::create(this->exec, this->mtx_system->get_size()); + auto res_mtx_u = Csr::create(this->exec, this->mtx_system->get_size()); + + gko::kernels::reference::par_ilut_factorization::add_candidates( + this->ref, this->mtx_lu.get(), this->mtx_system.get(), + this->mtx_l.get(), this->mtx_u.get(), res_mtx_l.get(), res_mtx_u.get()); + + GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_l, this->mtx_l_add_expect); + GKO_ASSERT_MTX_EQ_SPARSITY(res_mtx_u, this->mtx_u_add_expect); + GKO_ASSERT_MTX_NEAR(res_mtx_l, this->mtx_l_add_expect, this->tol); + GKO_ASSERT_MTX_NEAR(res_mtx_u, this->mtx_u_add_expect, this->tol); +} + + +TYPED_TEST(ParIlut, KernelComputeLU) +{ + using Csr = typename TestFixture::Csr; + using Coo = typename TestFixture::Coo; + using value_type = typename TestFixture::value_type; + auto mtx_l_coo = Coo::create(this->exec, this->mtx_system->get_size()); + this->mtx_l_system->convert_to(mtx_l_coo.get()); + auto mtx_u_transp = this->mtx_u_system->transpose(); + auto mtx_u_coo = Coo::create(this->exec, this->mtx_system->get_size()); + this->mtx_u_system->convert_to(mtx_u_coo.get()); + auto mtx_u_csc = gko::as(mtx_u_transp.get()); + + gko::kernels::reference::par_ilut_factorization::compute_l_u_factors( + this->ref, this->mtx_system.get(), 
this->mtx_l_system.get(), + mtx_l_coo.get(), this->mtx_u_system.get(), mtx_u_coo.get(), mtx_u_csc); + auto mtx_utt = gko::as(mtx_u_csc->transpose()); + + GKO_ASSERT_MTX_NEAR(this->mtx_l_system, this->mtx_l_it_expect, this->tol); + GKO_ASSERT_MTX_NEAR(mtx_u_csc, this->mtx_u_it_expect, this->tol); + GKO_ASSERT_MTX_NEAR(this->mtx_u_system, mtx_utt, 0); +} + + +TYPED_TEST(ParIlut, ThrowNotSupportedForWrongLinOp) +{ + auto lin_op = DummyLinOp::create(this->ref); + + ASSERT_THROW(this->fact_fact->generate(gko::share(lin_op)), + gko::NotSupported); +} + + +TYPED_TEST(ParIlut, ThrowDimensionMismatch) +{ + using Csr = typename TestFixture::Csr; + auto matrix = Csr::create(this->ref, gko::dim<2>{2, 3}, 4); + + ASSERT_THROW(this->fact_fact->generate(gko::share(matrix)), + gko::DimensionMismatch); +} + + +TYPED_TEST(ParIlut, SetStrategies) +{ + using Csr = typename TestFixture::Csr; + using factorization_type = typename TestFixture::factorization_type; + auto l_strategy = std::make_shared(); + auto u_strategy = std::make_shared(); + + auto factory = factorization_type::build() + .with_l_strategy(l_strategy) + .with_u_strategy(u_strategy) + .on(this->ref); + auto fact = factory->generate(this->mtx_system); + + ASSERT_EQ(factory->get_parameters().l_strategy, l_strategy); + ASSERT_EQ(fact->get_l_factor()->get_strategy()->get_name(), + l_strategy->get_name()); + ASSERT_EQ(factory->get_parameters().u_strategy, u_strategy); + ASSERT_EQ(fact->get_u_factor()->get_strategy()->get_name(), + u_strategy->get_name()); +} + + +TYPED_TEST(ParIlut, IsConsistentWithComposition) +{ + auto fact = this->fact_fact->generate(this->mtx_system); + + auto lin_op_l_factor = + static_cast(gko::lend(fact->get_l_factor())); + auto lin_op_u_factor = + static_cast(gko::lend(fact->get_u_factor())); + auto first_operator = gko::lend(fact->get_operators()[0]); + auto second_operator = gko::lend(fact->get_operators()[1]); + + ASSERT_EQ(lin_op_l_factor, first_operator); + ASSERT_EQ(lin_op_u_factor, 
second_operator); +} + + +TYPED_TEST(ParIlut, GenerateIdentity) +{ + auto fact = this->fact_fact->generate(this->identity); + + GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->identity, this->tol); + GKO_ASSERT_MTX_NEAR(fact->get_u_factor(), this->identity, this->tol); +} + + +TYPED_TEST(ParIlut, GenerateDenseIdentity) +{ + using Dense = typename TestFixture::Dense; + auto dense_id = Dense::create(this->exec, this->identity->get_size()); + this->identity->convert_to(dense_id.get()); + auto fact = this->fact_fact->generate(gko::share(dense_id)); + + GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->identity, this->tol); + GKO_ASSERT_MTX_NEAR(fact->get_u_factor(), this->identity, this->tol); +} + + +TYPED_TEST(ParIlut, GenerateLowerTri) +{ + auto fact = this->fact_fact->generate(this->lower_tri); + + GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->lower_tri, this->tol); + GKO_ASSERT_MTX_NEAR(fact->get_u_factor(), this->identity, this->tol); +} + + +TYPED_TEST(ParIlut, GenerateUpperTri) +{ + auto fact = this->fact_fact->generate(this->upper_tri); + + GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->identity, this->tol); + GKO_ASSERT_MTX_NEAR(fact->get_u_factor(), this->upper_tri, this->tol); +} + + +TYPED_TEST(ParIlut, GenerateWithExactSmallLimit) +{ + using factorization_type = typename TestFixture::factorization_type; + auto fact = factorization_type::build() + .with_approximate_select(false) + .with_fill_in_limit(0.75) + .on(this->exec) + ->generate(this->mtx_system); + + GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->mtx_l_small_expect, + this->tol); + GKO_ASSERT_MTX_NEAR(fact->get_u_factor(), this->mtx_u_small_expect, + this->tol); +} + + +TYPED_TEST(ParIlut, GenerateWithApproxSmallLimit) +{ + using factorization_type = typename TestFixture::factorization_type; + auto fact = factorization_type::build() + .with_approximate_select(true) + .with_fill_in_limit(0.75) + .on(this->exec) + ->generate(this->mtx_system); + + GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), 
this->mtx_l_small_expect, + this->tol); + GKO_ASSERT_MTX_NEAR(fact->get_u_factor(), this->mtx_u_small_expect, + this->tol); +} + + +TYPED_TEST(ParIlut, GenerateWithExactLargeLimit) +{ + using factorization_type = typename TestFixture::factorization_type; + auto fact = factorization_type::build() + .with_approximate_select(false) + .with_fill_in_limit(1.2) + .on(this->exec) + ->generate(this->mtx_system); + + GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->mtx_l_large_expect, + this->tol); + GKO_ASSERT_MTX_NEAR(fact->get_u_factor(), this->mtx_u_large_expect, + this->tol); +} + + +TYPED_TEST(ParIlut, GenerateWithApproxLargeLimit) +{ + using factorization_type = typename TestFixture::factorization_type; + auto fact = factorization_type::build() + .with_approximate_select(true) + .with_fill_in_limit(1.2) + .on(this->exec) + ->generate(this->mtx_system); + + GKO_ASSERT_MTX_NEAR(fact->get_l_factor(), this->mtx_l_large_expect, + this->tol); + GKO_ASSERT_MTX_NEAR(fact->get_u_factor(), this->mtx_u_large_expect, + this->tol); +} + + +} // namespace diff --git a/reference/test/log/convergence.cpp b/reference/test/log/convergence.cpp index 637761ee19b..01a9b17c303 100644 --- a/reference/test/log/convergence.cpp +++ b/reference/test/log/convergence.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -36,37 +36,47 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include #include +#include #include #include +#include "core/test/utils.hpp" + + namespace { -TEST(Record, CatchesCriterionCheckCompleted) +template +class Convergence : public ::testing::Test {}; + +TYPED_TEST_CASE(Convergence, gko::test::ValueTypes); + + +TYPED_TEST(Convergence, CatchesCriterionCheckCompleted) { auto exec = gko::ReferenceExecutor::create(); - auto logger = gko::log::Convergence<>::create( + auto logger = gko::log::Convergence::create( exec, gko::log::Logger::criterion_check_completed_mask); auto criterion = gko::stop::Iteration::build().with_max_iters(3u).on(exec)->generate( nullptr, nullptr, nullptr); constexpr gko::uint8 RelativeStoppingId{42}; gko::Array stop_status(exec, 1); - using Mtx = gko::matrix::Dense<>; + using Mtx = gko::matrix::Dense; + using NormVector = gko::matrix::Dense>; auto residual = gko::initialize({1.0, 2.0, 2.0}, exec); - logger->on( + logger->template on( criterion.get(), 1, residual.get(), nullptr, nullptr, RelativeStoppingId, true, &stop_status, true, true); ASSERT_EQ(logger->get_num_iterations(), 1); GKO_ASSERT_MTX_NEAR(gko::as(logger->get_residual()), l({1.0, 2.0, 2.0}), 0.0); - GKO_ASSERT_MTX_NEAR(gko::as(logger->get_residual_norm()), l({3.0}), - 0.0); + GKO_ASSERT_MTX_NEAR(gko::as(logger->get_residual_norm()), + l({3.0}), 0.0); } diff --git a/reference/test/log/papi.cpp b/reference/test/log/papi.cpp index 3482f3aaa10..842b6214374 100644 --- a/reference/test/log/papi.cpp +++ b/reference/test/log/papi.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ - #include @@ -38,18 +37,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include #include #include +#include "core/test/utils.hpp" + + namespace { +template class Papi : public ::testing::Test { protected: - using Dense = gko::matrix::Dense<>; + using Dense = gko::matrix::Dense; Papi() : exec(gko::ReferenceExecutor::create()), eventset(PAPI_NULL) {} @@ -67,11 +69,11 @@ class Papi : public ::testing::Test { void TearDown() { eventset = PAPI_NULL; } - template + template const std::string init(const gko::log::Logger::mask_type &event, - const std::string &event_name, T *ptr) + const std::string &event_name, U *ptr) { - logger = gko::log::Papi<>::create(exec, event); + logger = gko::log::Papi::create(exec, event); std::ostringstream os; os << "sde:::" << logger->get_handle_name() << "::" << event_name << "_" << reinterpret_cast(ptr); @@ -108,29 +110,33 @@ class Papi : public ::testing::Test { } } - std::shared_ptr> logger; + std::shared_ptr> logger; std::shared_ptr exec; int eventset; }; +TYPED_TEST_CASE(Papi, gko::test::ValueTypes); + -TEST_F(Papi, CatchesCriterionCheckCompleted) +TYPED_TEST(Papi, CatchesCriterionCheckCompleted) { - auto residual_norm = gko::initialize({4.0}, exec); - auto criterion = - gko::stop::Iteration::build().with_max_iters(3u).on(exec)->generate( - nullptr, nullptr, nullptr); - auto str = init(gko::log::Logger::criterion_check_completed_mask, - "criterion_check_completed", criterion.get()); - add_event(str + ":CNT"); - add_event(str); - - start(); - logger->on( + using Dense = typename TestFixture::Dense; + auto residual_norm = gko::initialize({4.0}, this->exec); + auto criterion = gko::stop::Iteration::build() + .with_max_iters(3u) + .on(this->exec) + ->generate(nullptr, nullptr, nullptr); + auto str = this->init(gko::log::Logger::criterion_check_completed_mask, + "criterion_check_completed", criterion.get()); + this->add_event(str + 
":CNT"); + this->add_event(str); + + this->start(); + this->logger->template on( criterion.get(), 0, nullptr, residual_norm.get(), nullptr, 0, false, nullptr, false, false); long long int values[2]; - stop(values); + this->stop(values); double *sde_ptr = GET_SDE_RECORDER_ADDRESS(values[1], double); ASSERT_EQ(values[0], 1); diff --git a/reference/test/matrix/CMakeLists.txt b/reference/test/matrix/CMakeLists.txt index 7c0b5742eed..d6878d864f3 100644 --- a/reference/test/matrix/CMakeLists.txt +++ b/reference/test/matrix/CMakeLists.txt @@ -4,6 +4,7 @@ ginkgo_create_test(dense_kernels) ginkgo_create_test(ell_kernels) ginkgo_create_test(hybrid_kernels) ginkgo_create_test(identity) +ginkgo_create_test(permutation) ginkgo_create_test(sellp_kernels) ginkgo_create_test(sparsity_csr) ginkgo_create_test(sparsity_csr_kernels) diff --git a/reference/test/matrix/coo_kernels.cpp b/reference/test/matrix/coo_kernels.cpp index 0c0889df6ed..629a06dde23 100644 --- a/reference/test/matrix/coo_kernels.cpp +++ b/reference/test/matrix/coo_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/coo_kernels.hpp" +#include #include @@ -41,22 +41,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include -#include #include #include -#include "core/test/utils/assertions.hpp" +#include "core/matrix/coo_kernels.hpp" +#include "core/test/utils.hpp" namespace { +template class Coo : public ::testing::Test { protected: - using Csr = gko::matrix::Csr<>; - using Mtx = gko::matrix::Coo<>; - using Vec = gko::matrix::Dense<>; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Csr = gko::matrix::Csr; + using Mtx = gko::matrix::Coo; + using Vec = gko::matrix::Dense; Coo() : exec(gko::ReferenceExecutor::create()), mtx(Mtx::create(exec)) { @@ -80,61 +85,110 @@ class Coo : public ::testing::Test { EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 3.0); - EXPECT_EQ(v[2], 2.0); - EXPECT_EQ(v[3], 5.0); + EXPECT_EQ(v[0], value_type{1.0}); + EXPECT_EQ(v[1], value_type{3.0}); + EXPECT_EQ(v[2], value_type{2.0}); + EXPECT_EQ(v[3], value_type{5.0}); } std::shared_ptr exec; std::unique_ptr mtx; }; +TYPED_TEST_CASE(Coo, gko::test::ValueIndexTypes); + -TEST_F(Coo, ConvertsToCsr) +TYPED_TEST(Coo, ConvertsToPrecision) { - auto csr_s_classical = std::make_shared::classical>(); - auto csr_s_merge = std::make_shared::merge_path>(); - auto csr_mtx_c = - gko::matrix::Csr<>::create(mtx->get_executor(), csr_s_classical); - auto csr_mtx_m = - gko::matrix::Csr<>::create(mtx->get_executor(), csr_s_merge); - - mtx->convert_to(csr_mtx_c.get()); - mtx->convert_to(csr_mtx_m.get()); - - assert_equal_to_mtx_in_csr_format(csr_mtx_c.get()); - assert_equal_to_mtx_in_csr_format(csr_mtx_m.get()); - ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical); - ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge); + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using OtherType = typename gko::next_precision; + using Coo = typename TestFixture::Mtx; + using 
OtherCoo = gko::matrix::Coo; + auto tmp = OtherCoo::create(this->exec); + auto res = Coo::create(this->exec); + // If OtherType is more precise: 0, otherwise r + auto residual = r::value < r::value + ? gko::remove_complex{0} + : gko::remove_complex{r::value}; + + this->mtx->convert_to(tmp.get()); + tmp->convert_to(res.get()); + + GKO_ASSERT_MTX_NEAR(this->mtx, res, residual); } -TEST_F(Coo, MovesToCsr) +TYPED_TEST(Coo, MovesToPrecision) { - auto csr_s_classical = std::make_shared::classical>(); - auto csr_s_merge = std::make_shared::merge_path>(); - auto csr_mtx_c = - gko::matrix::Csr<>::create(mtx->get_executor(), csr_s_classical); - auto csr_mtx_m = - gko::matrix::Csr<>::create(mtx->get_executor(), csr_s_merge); - auto mtx_clone = mtx->clone(); - - mtx->move_to(csr_mtx_c.get()); + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using OtherType = typename gko::next_precision; + using Coo = typename TestFixture::Mtx; + using OtherCoo = gko::matrix::Coo; + auto tmp = OtherCoo::create(this->exec); + auto res = Coo::create(this->exec); + // If OtherType is more precise: 0, otherwise r + auto residual = r::value < r::value + ? 
gko::remove_complex{0} + : gko::remove_complex{r::value}; + + this->mtx->move_to(tmp.get()); + tmp->move_to(res.get()); + + GKO_ASSERT_MTX_NEAR(this->mtx, res, residual); +} + + +TYPED_TEST(Coo, ConvertsToCsr) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Csr = typename TestFixture::Csr; + auto csr_s_classical = std::make_shared(); + auto csr_s_merge = std::make_shared(); + auto csr_mtx_c = Csr::create(this->mtx->get_executor(), csr_s_classical); + auto csr_mtx_m = Csr::create(this->mtx->get_executor(), csr_s_merge); + + this->mtx->convert_to(csr_mtx_c.get()); + this->mtx->convert_to(csr_mtx_m.get()); + + this->assert_equal_to_mtx_in_csr_format(csr_mtx_c.get()); + this->assert_equal_to_mtx_in_csr_format(csr_mtx_m.get()); + ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical"); + ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path"); +} + + +TYPED_TEST(Coo, MovesToCsr) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Csr = typename TestFixture::Csr; + auto csr_s_classical = std::make_shared(); + auto csr_s_merge = std::make_shared(); + auto csr_mtx_c = Csr::create(this->mtx->get_executor(), csr_s_classical); + auto csr_mtx_m = Csr::create(this->mtx->get_executor(), csr_s_merge); + auto mtx_clone = this->mtx->clone(); + + this->mtx->move_to(csr_mtx_c.get()); mtx_clone->move_to(csr_mtx_m.get()); - assert_equal_to_mtx_in_csr_format(csr_mtx_c.get()); - assert_equal_to_mtx_in_csr_format(csr_mtx_m.get()); - ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical); - ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge); + this->assert_equal_to_mtx_in_csr_format(csr_mtx_c.get()); + this->assert_equal_to_mtx_in_csr_format(csr_mtx_m.get()); + ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical"); + ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path"); } -TEST_F(Coo, ConvertsToDense) +TYPED_TEST(Coo, 
ConvertsToDense) { - auto dense_mtx = gko::matrix::Dense<>::create(mtx->get_executor()); + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Dense = typename TestFixture::Vec; + auto dense_mtx = Dense::create(this->mtx->get_executor()); - mtx->convert_to(dense_mtx.get()); + this->mtx->convert_to(dense_mtx.get()); // clang-format off GKO_ASSERT_MTX_NEAR(dense_mtx, @@ -144,11 +198,13 @@ TEST_F(Coo, ConvertsToDense) } -TEST_F(Coo, MovesToDense) +TYPED_TEST(Coo, MovesToDense) { - auto dense_mtx = gko::matrix::Dense<>::create(mtx->get_executor()); + using value_type = typename TestFixture::value_type; + using Dense = typename TestFixture::Vec; + auto dense_mtx = Dense::create(this->mtx->get_executor()); - mtx->move_to(dense_mtx.get()); + this->mtx->move_to(dense_mtx.get()); // clang-format off GKO_ASSERT_MTX_NEAR(dense_mtx, @@ -158,193 +214,309 @@ TEST_F(Coo, MovesToDense) } -TEST_F(Coo, AppliesToDenseVector) +TYPED_TEST(Coo, ConvertsEmptyToPrecision) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using OtherType = typename gko::next_precision; + using Coo = typename TestFixture::Mtx; + using OtherCoo = gko::matrix::Coo; + auto empty = OtherCoo::create(this->exec); + auto res = Coo::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Coo, MovesEmptyToPrecision) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using OtherType = typename gko::next_precision; + using Coo = typename TestFixture::Mtx; + using OtherCoo = gko::matrix::Coo; + auto empty = OtherCoo::create(this->exec); + auto res = Coo::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Coo, ConvertsEmptyToCsr) +{ + 
using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Coo = typename TestFixture::Mtx; + using Csr = gko::matrix::Csr; + auto empty = Coo::create(this->exec); + auto res = Csr::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_row_ptrs(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Coo, MovesEmptyToCsr) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Coo = typename TestFixture::Mtx; + using Csr = gko::matrix::Csr; + auto empty = Coo::create(this->exec); + auto res = Csr::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_row_ptrs(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Coo, ConvertsEmptyToDense) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Coo = typename TestFixture::Mtx; + using Dense = gko::matrix::Dense; + auto empty = Coo::create(this->exec); + auto res = Dense::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Coo, MovesEmptyToDense) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Coo = typename TestFixture::Mtx; + using Dense = gko::matrix::Dense; + auto empty = Coo::create(this->exec); + auto res = Dense::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Coo, AppliesToDenseVector) { - auto x = gko::initialize({2.0, 1.0, 4.0}, exec); - auto y = Vec::create(exec, gko::dim<2>{2, 1}); + using Vec = typename TestFixture::Vec; + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); - mtx->apply(x.get(), y.get()); + this->mtx->apply(x.get(), 
y.get()); GKO_ASSERT_MTX_NEAR(y, l({13.0, 5.0}), 0.0); } -TEST_F(Coo, AppliesToDenseMatrix) +TYPED_TEST(Coo, AppliesToDenseMatrix) { + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; // clang-format off auto x = gko::initialize( - {{2.0, 3.0}, - {1.0, -1.5}, - {4.0, 2.5}}, exec); + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); // clang-format on - auto y = Vec::create(exec, gko::dim<2>{2, 2}); + auto y = Vec::create(this->exec, gko::dim<2>{2, 2}); - mtx->apply(x.get(), y.get()); + this->mtx->apply(x.get(), y.get()); // clang-format off GKO_ASSERT_MTX_NEAR(y, - l({{13.0, 3.5}, - { 5.0, -7.5}}), 0.0); + l({{13.0, 3.5}, + { 5.0, -7.5}}), 0.0); // clang-format on } -TEST_F(Coo, AppliesLinearCombinationToDenseVector) +TYPED_TEST(Coo, AppliesLinearCombinationToDenseVector) { - auto alpha = gko::initialize({-1.0}, exec); - auto beta = gko::initialize({2.0}, exec); - auto x = gko::initialize({2.0, 1.0, 4.0}, exec); - auto y = gko::initialize({1.0, 2.0}, exec); + using Vec = typename TestFixture::Vec; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = gko::initialize({1.0, 2.0}, this->exec); - mtx->apply(alpha.get(), x.get(), beta.get(), y.get()); + this->mtx->apply(alpha.get(), x.get(), beta.get(), y.get()); GKO_ASSERT_MTX_NEAR(y, l({-11.0, -1.0}), 0.0); } -TEST_F(Coo, AppliesLinearCombinationToDenseMatrix) +TYPED_TEST(Coo, AppliesLinearCombinationToDenseMatrix) { - auto alpha = gko::initialize({-1.0}, exec); - auto beta = gko::initialize({2.0}, exec); + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); // clang-format off auto x = gko::initialize( - {{2.0, 3.0}, - {1.0, -1.5}, - {4.0, 2.5}}, exec); + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); auto y = 
gko::initialize( - {{1.0, 0.5}, - {2.0, -1.5}}, exec); + {I{1.0, 0.5}, + I{2.0, -1.5}}, this->exec); // clang-format on - mtx->apply(alpha.get(), x.get(), beta.get(), y.get()); + this->mtx->apply(alpha.get(), x.get(), beta.get(), y.get()); // clang-format off GKO_ASSERT_MTX_NEAR(y, - l({{-11.0, -2.5}, - { -1.0, 4.5}}), 0.0); + l({{-11.0, -2.5}, + { -1.0, 4.5}}), 0.0); // clang-format on } -TEST_F(Coo, ApplyFailsOnWrongInnerDimension) +TYPED_TEST(Coo, ApplyFailsOnWrongInnerDimension) { - auto x = Vec::create(exec, gko::dim<2>{2}); - auto y = Vec::create(exec, gko::dim<2>{2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{2}); + auto y = Vec::create(this->exec, gko::dim<2>{2}); - ASSERT_THROW(mtx->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Coo, ApplyFailsOnWrongNumberOfRows) +TYPED_TEST(Coo, ApplyFailsOnWrongNumberOfRows) { - auto x = Vec::create(exec, gko::dim<2>{3, 2}); - auto y = Vec::create(exec, gko::dim<2>{3, 2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{3, 2}); + auto y = Vec::create(this->exec, gko::dim<2>{3, 2}); - ASSERT_THROW(mtx->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Coo, ApplyFailsOnWrongNumberOfCols) +TYPED_TEST(Coo, ApplyFailsOnWrongNumberOfCols) { - auto x = Vec::create(exec, gko::dim<2>{3}); - auto y = Vec::create(exec, gko::dim<2>{2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{3}); + auto y = Vec::create(this->exec, gko::dim<2>{2}); - ASSERT_THROW(mtx->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Coo, AppliesAddToDenseVector) +TYPED_TEST(Coo, AppliesAddToDenseVector) { - auto x = gko::initialize({2.0, 1.0, 4.0}, exec); - auto y = 
gko::initialize({2.0, 1.0}, exec); + using Vec = typename TestFixture::Vec; + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = gko::initialize({2.0, 1.0}, this->exec); - mtx->apply2(x.get(), y.get()); + this->mtx->apply2(x.get(), y.get()); GKO_ASSERT_MTX_NEAR(y, l({15.0, 6.0}), 0.0); } -TEST_F(Coo, AppliesAddToDenseMatrix) +TYPED_TEST(Coo, AppliesAddToDenseMatrix) { + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; // clang-format off auto x = gko::initialize( - {{2.0, 3.0}, - {1.0, -1.5}, - {4.0, 2.5}}, exec); + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); auto y = gko::initialize( - {{1.0, 0.5}, - {2.0, -1.5}}, exec); + {I{1.0, 0.5}, + I{2.0, -1.5}}, this->exec); // clang-format on - mtx->apply2(x.get(), y.get()); + this->mtx->apply2(x.get(), y.get()); // clang-format off GKO_ASSERT_MTX_NEAR(y, - l({{14.0, 4.0}, - { 7.0, -9.0}}), 0.0); + l({{14.0, 4.0}, + { 7.0, -9.0}}), 0.0); // clang-format on } -TEST_F(Coo, AppliesLinearCombinationAddToDenseVector) +TYPED_TEST(Coo, AppliesLinearCombinationAddToDenseVector) { - auto alpha = gko::initialize({-1.0}, exec); - auto x = gko::initialize({2.0, 1.0, 4.0}, exec); - auto y = gko::initialize({1.0, 2.0}, exec); + using Vec = typename TestFixture::Vec; + auto alpha = gko::initialize({-1.0}, this->exec); + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = gko::initialize({1.0, 2.0}, this->exec); - mtx->apply2(alpha.get(), x.get(), y.get()); + this->mtx->apply2(alpha.get(), x.get(), y.get()); GKO_ASSERT_MTX_NEAR(y, l({-12.0, -3.0}), 0.0); } -TEST_F(Coo, AppliesLinearCombinationAddToDenseMatrix) +TYPED_TEST(Coo, AppliesLinearCombinationAddToDenseMatrix) { - auto alpha = gko::initialize({-1.0}, exec); + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; + auto alpha = gko::initialize({-1.0}, this->exec); // clang-format off auto x = gko::initialize( - {{2.0, 3.0}, - {1.0, -1.5}, - {4.0, 2.5}}, exec); + {I{2.0, 
3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); auto y = gko::initialize( - {{1.0, 0.5}, - {2.0, -1.5}}, exec); + {I{1.0, 0.5}, + I{2.0, -1.5}}, this->exec); // clang-format on - mtx->apply2(alpha.get(), x.get(), y.get()); + this->mtx->apply2(alpha.get(), x.get(), y.get()); // clang-format off GKO_ASSERT_MTX_NEAR(y, - l({{-12.0, -3.0}, - { -3.0, 6.0}}), 0.0); + l({{-12.0, -3.0}, + { -3.0, 6.0}}), 0.0); // clang-format on } -TEST_F(Coo, ApplyAddFailsOnWrongInnerDimension) +TYPED_TEST(Coo, ApplyAddFailsOnWrongInnerDimension) { - auto x = Vec::create(exec, gko::dim<2>{2}); - auto y = Vec::create(exec, gko::dim<2>{2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{2}); + auto y = Vec::create(this->exec, gko::dim<2>{2}); - ASSERT_THROW(mtx->apply2(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx->apply2(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Coo, ApplyAddFailsOnWrongNumberOfRows) +TYPED_TEST(Coo, ApplyAddFailsOnWrongNumberOfRows) { - auto x = Vec::create(exec, gko::dim<2>{3, 2}); - auto y = Vec::create(exec, gko::dim<2>{3, 2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{3, 2}); + auto y = Vec::create(this->exec, gko::dim<2>{3, 2}); - ASSERT_THROW(mtx->apply2(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx->apply2(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Coo, ApplyAddFailsOnWrongNumberOfCols) +TYPED_TEST(Coo, ApplyAddFailsOnWrongNumberOfCols) { - auto x = Vec::create(exec, gko::dim<2>{3}); - auto y = Vec::create(exec, gko::dim<2>{2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{3}); + auto y = Vec::create(this->exec, gko::dim<2>{2}); - ASSERT_THROW(mtx->apply2(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx->apply2(x.get(), y.get()), gko::DimensionMismatch); } diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp 
index f775ea018ee..736f90349ad 100644 --- a/reference/test/matrix/csr_kernels.cpp +++ b/reference/test/matrix/csr_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/csr_kernels.hpp" +#include #include @@ -43,41 +43,47 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include #include #include #include +#include #include #include -#include "core/test/utils/assertions.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/test/utils.hpp" namespace { +template class Csr : public ::testing::Test { protected: - using Coo = gko::matrix::Coo<>; - using Mtx = gko::matrix::Csr<>; - using Sellp = gko::matrix::Sellp<>; - using SparsityCsr = gko::matrix::SparsityCsr<>; - using Ell = gko::matrix::Ell<>; - using Hybrid = gko::matrix::Hybrid<>; - using ComplexMtx = gko::matrix::Csr>; - using Vec = gko::matrix::Dense<>; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Coo = gko::matrix::Coo; + using Mtx = gko::matrix::Csr; + using Sellp = gko::matrix::Sellp; + using SparsityCsr = gko::matrix::SparsityCsr; + using Ell = gko::matrix::Ell; + using Hybrid = gko::matrix::Hybrid; + using Vec = gko::matrix::Dense; Csr() : exec(gko::ReferenceExecutor::create()), mtx(Mtx::create(exec, gko::dim<2>{2, 3}, 4, - std::make_shared(2))), + std::make_shared(2))), mtx2(Mtx::create(exec, gko::dim<2>{2, 3}, 5, - std::make_shared())), + std::make_shared())), 
mtx3_sorted(Mtx::create(exec, gko::dim<2>(3, 3), 7, - std::make_shared())), - mtx3_unsorted(Mtx::create(exec, gko::dim<2>(3, 3), 7, - std::make_shared())) + std::make_shared())), + mtx3_unsorted( + Mtx::create(exec, gko::dim<2>(3, 3), 7, + std::make_shared())) { this->create_mtx(mtx.get()); this->create_mtx2(mtx2.get()); @@ -86,9 +92,9 @@ class Csr : public ::testing::Test { void create_mtx(Mtx *m) { - Mtx::value_type *v = m->get_values(); - Mtx::index_type *c = m->get_col_idxs(); - Mtx::index_type *r = m->get_row_ptrs(); + value_type *v = m->get_values(); + index_type *c = m->get_col_idxs(); + index_type *r = m->get_row_ptrs(); auto *s = m->get_srow(); /* * 1 3 2 @@ -110,9 +116,9 @@ class Csr : public ::testing::Test { void create_mtx2(Mtx *m) { - Mtx::value_type *v = m->get_values(); - Mtx::index_type *c = m->get_col_idxs(); - Mtx::index_type *r = m->get_row_ptrs(); + value_type *v = m->get_values(); + index_type *c = m->get_col_idxs(); + index_type *r = m->get_row_ptrs(); // It keeps an explict zero /* * 1 3 2 @@ -206,10 +212,10 @@ class Csr : public ::testing::Test { EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 3.0); - EXPECT_EQ(v[2], 2.0); - EXPECT_EQ(v[3], 5.0); + EXPECT_EQ(v[0], value_type{1.0}); + EXPECT_EQ(v[1], value_type{3.0}); + EXPECT_EQ(v[2], value_type{2.0}); + EXPECT_EQ(v[3], value_type{5.0}); } void assert_equal_to_mtx(const Sellp *m) @@ -232,12 +238,12 @@ class Csr : public ::testing::Test { EXPECT_EQ(c[65], 0); EXPECT_EQ(c[128], 2); EXPECT_EQ(c[129], 0); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 5.0); - EXPECT_EQ(v[64], 3.0); - EXPECT_EQ(v[65], 0.0); - EXPECT_EQ(v[128], 2.0); - EXPECT_EQ(v[129], 0.0); + EXPECT_EQ(v[0], value_type{1.0}); + EXPECT_EQ(v[1], value_type{5.0}); + EXPECT_EQ(v[64], value_type{3.0}); + EXPECT_EQ(v[65], value_type{0.0}); + EXPECT_EQ(v[128], value_type{2.0}); + EXPECT_EQ(v[129], value_type{0.0}); } void assert_equal_to_mtx(const SparsityCsr *m) @@ -269,12 +275,12 @@ 
class Csr : public ::testing::Test { EXPECT_EQ(c[3], 0); EXPECT_EQ(c[4], 2); EXPECT_EQ(c[5], 0); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 5.0); - EXPECT_EQ(v[2], 3.0); - EXPECT_EQ(v[3], 0.0); - EXPECT_EQ(v[4], 2.0); - EXPECT_EQ(v[5], 0.0); + EXPECT_EQ(v[0], value_type{1.0}); + EXPECT_EQ(v[1], value_type{5.0}); + EXPECT_EQ(v[2], value_type{3.0}); + EXPECT_EQ(v[3], value_type{0.0}); + EXPECT_EQ(v[4], value_type{2.0}); + EXPECT_EQ(v[5], value_type{0.0}); } void assert_equal_to_mtx(const Hybrid *m) @@ -298,10 +304,10 @@ class Csr : public ::testing::Test { EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 3.0); - EXPECT_EQ(v[2], 2.0); - EXPECT_EQ(v[3], 5.0); + EXPECT_EQ(v[0], value_type{1.0}); + EXPECT_EQ(v[1], value_type{3.0}); + EXPECT_EQ(v[2], value_type{2.0}); + EXPECT_EQ(v[3], value_type{5.0}); } void assert_equal_to_mtx2(const Hybrid *m) @@ -319,22 +325,21 @@ class Csr : public ::testing::Test { ASSERT_EQ(m->get_coo_num_stored_elements(), 1); EXPECT_EQ(r[0], 0); EXPECT_EQ(c[0], 2); - EXPECT_EQ(v[0], 2.0); + EXPECT_EQ(v[0], value_type{2.0}); // Test Ell values ASSERT_EQ(m->get_ell_num_stored_elements(), 4); EXPECT_EQ(n, 2); EXPECT_EQ(p, 2); - EXPECT_EQ(ell_v[0], 1); - EXPECT_EQ(ell_v[1], 0); - EXPECT_EQ(ell_v[2], 3); - EXPECT_EQ(ell_v[3], 5); + EXPECT_EQ(ell_v[0], value_type{1}); + EXPECT_EQ(ell_v[1], value_type{0}); + EXPECT_EQ(ell_v[2], value_type{3}); + EXPECT_EQ(ell_v[3], value_type{5}); EXPECT_EQ(ell_c[0], 0); EXPECT_EQ(ell_c[1], 0); EXPECT_EQ(ell_c[2], 1); EXPECT_EQ(ell_c[3], 1); } - std::complex i{0, 1}; std::shared_ptr exec; std::unique_ptr mtx; std::unique_ptr mtx2; @@ -342,232 +347,641 @@ class Csr : public ::testing::Test { std::unique_ptr mtx3_unsorted; }; +TYPED_TEST_CASE(Csr, gko::test::ValueIndexTypes); + -TEST_F(Csr, AppliesToDenseVector) +TYPED_TEST(Csr, AppliesToDenseVector) { - auto x = gko::initialize({2.0, 1.0, 4.0}, exec); - auto y = Vec::create(exec, gko::dim<2>{2, 1}); + using Vec = 
typename TestFixture::Vec; + using T = typename TestFixture::value_type; + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); + + this->mtx->apply(x.get(), y.get()); + + EXPECT_EQ(y->at(0), T{13.0}); + EXPECT_EQ(y->at(1), T{5.0}); +} - mtx->apply(x.get(), y.get()); - EXPECT_EQ(y->at(0), 13.0); - EXPECT_EQ(y->at(1), 5.0); +TYPED_TEST(Csr, AppliesToDenseMatrix) +{ + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; + auto x = gko::initialize( + {I{2.0, 3.0}, I{1.0, -1.5}, I{4.0, 2.5}}, this->exec); + auto y = Vec::create(this->exec, gko::dim<2>{2}); + + this->mtx->apply(x.get(), y.get()); + + EXPECT_EQ(y->at(0, 0), T{13.0}); + EXPECT_EQ(y->at(1, 0), T{5.0}); + EXPECT_EQ(y->at(0, 1), T{3.5}); + EXPECT_EQ(y->at(1, 1), T{-7.5}); } -TEST_F(Csr, AppliesToDenseMatrix) +TYPED_TEST(Csr, AppliesLinearCombinationToDenseVector) { - auto x = gko::initialize({{2.0, 3.0}, {1.0, -1.5}, {4.0, 2.5}}, exec); - auto y = Vec::create(exec, gko::dim<2>{2}); + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = gko::initialize({1.0, 2.0}, this->exec); - mtx->apply(x.get(), y.get()); + this->mtx->apply(alpha.get(), x.get(), beta.get(), y.get()); - EXPECT_EQ(y->at(0, 0), 13.0); - EXPECT_EQ(y->at(1, 0), 5.0); - EXPECT_EQ(y->at(0, 1), 3.5); - EXPECT_EQ(y->at(1, 1), -7.5); + EXPECT_EQ(y->at(0), T{-11.0}); + EXPECT_EQ(y->at(1), T{-1.0}); } -TEST_F(Csr, AppliesLinearCombinationToDenseVector) +TYPED_TEST(Csr, AppliesLinearCombinationToDenseMatrix) { - auto alpha = gko::initialize({-1.0}, exec); - auto beta = gko::initialize({2.0}, exec); - auto x = gko::initialize({2.0, 1.0, 4.0}, exec); - auto y = gko::initialize({1.0, 2.0}, exec); + using Vec = typename TestFixture::Vec; + using T = typename 
TestFixture::value_type; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + auto x = gko::initialize( + {I{2.0, 3.0}, I{1.0, -1.5}, I{4.0, 2.5}}, this->exec); + auto y = + gko::initialize({I{1.0, 0.5}, I{2.0, -1.5}}, this->exec); + + this->mtx->apply(alpha.get(), x.get(), beta.get(), y.get()); + + EXPECT_EQ(y->at(0, 0), T{-11.0}); + EXPECT_EQ(y->at(1, 0), T{-1.0}); + EXPECT_EQ(y->at(0, 1), T{-2.5}); + EXPECT_EQ(y->at(1, 1), T{4.5}); +} - mtx->apply(alpha.get(), x.get(), beta.get(), y.get()); - EXPECT_EQ(y->at(0), -11.0); - EXPECT_EQ(y->at(1), -1.0); +TYPED_TEST(Csr, AppliesToCsrMatrix) +{ + using T = typename TestFixture::value_type; + this->mtx->apply(this->mtx3_unsorted.get(), this->mtx2.get()); + + ASSERT_EQ(this->mtx2->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(this->mtx2->get_num_stored_elements(), 6); + ASSERT_TRUE(this->mtx2->is_sorted_by_column_index()); + auto r = this->mtx2->get_const_row_ptrs(); + auto c = this->mtx2->get_const_col_idxs(); + auto v = this->mtx2->get_const_values(); + // 13 5 31 + // 15 5 40 + EXPECT_EQ(r[0], 0); + EXPECT_EQ(r[1], 3); + EXPECT_EQ(r[2], 6); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[2], 2); + EXPECT_EQ(c[3], 0); + EXPECT_EQ(c[4], 1); + EXPECT_EQ(c[5], 2); + EXPECT_EQ(v[0], T{13}); + EXPECT_EQ(v[1], T{5}); + EXPECT_EQ(v[2], T{31}); + EXPECT_EQ(v[3], T{15}); + EXPECT_EQ(v[4], T{5}); + EXPECT_EQ(v[5], T{40}); } -TEST_F(Csr, AppliesLinearCombinationToDenseMatrix) +TYPED_TEST(Csr, AppliesLinearCombinationToCsrMatrix) { - auto alpha = gko::initialize({-1.0}, exec); - auto beta = gko::initialize({2.0}, exec); - auto x = gko::initialize({{2.0, 3.0}, {1.0, -1.5}, {4.0, 2.5}}, exec); - auto y = gko::initialize({{1.0, 0.5}, {2.0, -1.5}}, exec); + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + + this->mtx->apply(alpha.get(), 
this->mtx3_unsorted.get(), beta.get(), + this->mtx2.get()); + + ASSERT_EQ(this->mtx2->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(this->mtx2->get_num_stored_elements(), 6); + ASSERT_TRUE(this->mtx2->is_sorted_by_column_index()); + auto r = this->mtx2->get_const_row_ptrs(); + auto c = this->mtx2->get_const_col_idxs(); + auto v = this->mtx2->get_const_values(); + // -11 1 -27 + // -15 5 -40 + EXPECT_EQ(r[0], 0); + EXPECT_EQ(r[1], 3); + EXPECT_EQ(r[2], 6); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[2], 2); + EXPECT_EQ(c[3], 0); + EXPECT_EQ(c[4], 1); + EXPECT_EQ(c[5], 2); + EXPECT_EQ(v[0], T{-11}); + EXPECT_EQ(v[1], T{1}); + EXPECT_EQ(v[2], T{-27}); + EXPECT_EQ(v[3], T{-15}); + EXPECT_EQ(v[4], T{5}); + EXPECT_EQ(v[5], T{-40}); +} - mtx->apply(alpha.get(), x.get(), beta.get(), y.get()); - EXPECT_EQ(y->at(0, 0), -11.0); - EXPECT_EQ(y->at(1, 0), -1.0); - EXPECT_EQ(y->at(0, 1), -2.5); - EXPECT_EQ(y->at(1, 1), 4.5); +TYPED_TEST(Csr, AppliesLinearCombinationToIdentityMatrix) +{ + using T = typename TestFixture::value_type; + using Vec = typename TestFixture::Vec; + using Mtx = typename TestFixture::Mtx; + auto alpha = gko::initialize({-3.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + auto a = gko::initialize( + {I{2.0, 0.0, 3.0}, I{0.0, 1.0, -1.5}, I{0.0, -2.0, 0.0}, + I{5.0, 0.0, 0.0}, I{1.0, 0.0, 4.0}, I{2.0, -2.0, 0.0}, + I{0.0, 0.0, 0.0}}, + this->exec); + auto b = gko::initialize( + {I{2.0, -2.0, 0.0}, I{1.0, 0.0, 4.0}, I{2.0, 0.0, 3.0}, + I{0.0, 1.0, -1.5}, I{1.0, 0.0, 0.0}, I{0.0, 0.0, 0.0}, + I{0.0, 0.0, 0.0}}, + this->exec); + auto expect = gko::initialize( + {I{-2.0, -4.0, -9.0}, I{2.0, -3.0, 12.5}, I{4.0, 6.0, 6.0}, + I{-15.0, 2.0, -3.0}, I{-1.0, 0.0, -12.0}, I{-6.0, 6.0, 0.0}, + I{0.0, 0.0, 0.0}}, + this->exec); + auto id = gko::matrix::Identity::create(this->exec, a->get_size()[1]); + + a->apply(gko::lend(alpha), gko::lend(id), gko::lend(beta), gko::lend(b)); + + GKO_ASSERT_MTX_NEAR(b, expect, r::value); + 
GKO_ASSERT_MTX_EQ_SPARSITY(b, expect); + ASSERT_TRUE(b->is_sorted_by_column_index()); } -TEST_F(Csr, ApplyFailsOnWrongInnerDimension) +TYPED_TEST(Csr, ApplyFailsOnWrongInnerDimension) { - auto x = Vec::create(exec, gko::dim<2>{2}); - auto y = Vec::create(exec, gko::dim<2>{2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{2}); + auto y = Vec::create(this->exec, gko::dim<2>{2}); - ASSERT_THROW(mtx->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Csr, ApplyFailsOnWrongNumberOfRows) +TYPED_TEST(Csr, ApplyFailsOnWrongNumberOfRows) { - auto x = Vec::create(exec, gko::dim<2>{3, 2}); - auto y = Vec::create(exec, gko::dim<2>{3, 2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{3, 2}); + auto y = Vec::create(this->exec, gko::dim<2>{3, 2}); - ASSERT_THROW(mtx->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Csr, ApplyFailsOnWrongNumberOfCols) +TYPED_TEST(Csr, ApplyFailsOnWrongNumberOfCols) { - auto x = Vec::create(exec, gko::dim<2>{3}); - auto y = Vec::create(exec, gko::dim<2>{2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{3}); + auto y = Vec::create(this->exec, gko::dim<2>{2}); + + ASSERT_THROW(this->mtx->apply(x.get(), y.get()), gko::DimensionMismatch); +} - ASSERT_THROW(mtx->apply(x.get(), y.get()), gko::DimensionMismatch); + +TYPED_TEST(Csr, ConvertsToPrecision) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using OtherType = typename gko::next_precision; + using Csr = typename TestFixture::Mtx; + using OtherCsr = gko::matrix::Csr; + auto tmp = OtherCsr::create(this->exec); + auto res = Csr::create(this->exec); + // If OtherType is more precise: 0, otherwise r + auto residual = r::value < r::value + ? 
gko::remove_complex{0} + : gko::remove_complex{r::value}; + + // use mtx2 as mtx's strategy would involve creating a CudaExecutor + this->mtx2->convert_to(tmp.get()); + tmp->convert_to(res.get()); + + GKO_ASSERT_MTX_NEAR(this->mtx2, res, residual); + ASSERT_EQ(typeid(*this->mtx2->get_strategy()), + typeid(*res->get_strategy())); } -TEST_F(Csr, ConvertsToDense) +TYPED_TEST(Csr, MovesToPrecision) { - auto dense_mtx = gko::matrix::Dense<>::create(mtx->get_executor()); - auto dense_other = gko::initialize>( - 4, {{1.0, 3.0, 2.0}, {0.0, 5.0, 0.0}}, exec); + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using OtherType = typename gko::next_precision; + using Csr = typename TestFixture::Mtx; + using OtherCsr = gko::matrix::Csr; + auto tmp = OtherCsr::create(this->exec); + auto res = Csr::create(this->exec); + // If OtherType is more precise: 0, otherwise r + auto residual = r::value < r::value + ? gko::remove_complex{0} + : gko::remove_complex{r::value}; + + // use mtx2 as mtx's strategy would involve creating a CudaExecutor + this->mtx2->move_to(tmp.get()); + tmp->move_to(res.get()); + + GKO_ASSERT_MTX_NEAR(this->mtx2, res, residual); + ASSERT_EQ(typeid(*this->mtx2->get_strategy()), + typeid(*res->get_strategy())); +} - mtx->convert_to(dense_mtx.get()); + +TYPED_TEST(Csr, ConvertsToDense) +{ + using Dense = typename TestFixture::Vec; + auto dense_mtx = Dense::create(this->mtx->get_executor()); + auto dense_other = gko::initialize( + 4, {{1.0, 3.0, 2.0}, {0.0, 5.0, 0.0}}, this->exec); + + this->mtx->convert_to(dense_mtx.get()); GKO_ASSERT_MTX_NEAR(dense_mtx, dense_other, 0.0); } -TEST_F(Csr, MovesToDense) +TYPED_TEST(Csr, MovesToDense) { - auto dense_mtx = gko::matrix::Dense<>::create(mtx->get_executor()); - auto dense_other = gko::initialize>( - 4, {{1.0, 3.0, 2.0}, {0.0, 5.0, 0.0}}, exec); + using Dense = typename TestFixture::Vec; + auto dense_mtx = Dense::create(this->mtx->get_executor()); + auto 
dense_other = gko::initialize( + 4, {{1.0, 3.0, 2.0}, {0.0, 5.0, 0.0}}, this->exec); - mtx->move_to(dense_mtx.get()); + this->mtx->move_to(dense_mtx.get()); GKO_ASSERT_MTX_NEAR(dense_mtx, dense_other, 0.0); } -TEST_F(Csr, ConvertsToCoo) +TYPED_TEST(Csr, ConvertsToCoo) { - auto coo_mtx = gko::matrix::Coo<>::create(mtx->get_executor()); + using Coo = typename TestFixture::Coo; + auto coo_mtx = Coo::create(this->mtx->get_executor()); - mtx->convert_to(coo_mtx.get()); + this->mtx->convert_to(coo_mtx.get()); - assert_equal_to_mtx(coo_mtx.get()); + this->assert_equal_to_mtx(coo_mtx.get()); } -TEST_F(Csr, MovesToCoo) +TYPED_TEST(Csr, MovesToCoo) { - auto coo_mtx = gko::matrix::Coo<>::create(mtx->get_executor()); + using Coo = typename TestFixture::Coo; + auto coo_mtx = Coo::create(this->mtx->get_executor()); - mtx->move_to(coo_mtx.get()); + this->mtx->move_to(coo_mtx.get()); - assert_equal_to_mtx(coo_mtx.get()); + this->assert_equal_to_mtx(coo_mtx.get()); } -TEST_F(Csr, ConvertsToSellp) +TYPED_TEST(Csr, ConvertsToSellp) { - auto sellp_mtx = gko::matrix::Sellp<>::create(mtx->get_executor()); + using Sellp = typename TestFixture::Sellp; + auto sellp_mtx = Sellp::create(this->mtx->get_executor()); - mtx->convert_to(sellp_mtx.get()); + this->mtx->convert_to(sellp_mtx.get()); - assert_equal_to_mtx(sellp_mtx.get()); + this->assert_equal_to_mtx(sellp_mtx.get()); } -TEST_F(Csr, MovesToSellp) +TYPED_TEST(Csr, MovesToSellp) { - auto sellp_mtx = gko::matrix::Sellp<>::create(mtx->get_executor()); - auto csr_ref = gko::matrix::Csr<>::create(mtx->get_executor()); + using Sellp = typename TestFixture::Sellp; + using Csr = typename TestFixture::Mtx; + auto sellp_mtx = Sellp::create(this->mtx->get_executor()); + auto csr_ref = Csr::create(this->mtx->get_executor()); - csr_ref->copy_from(mtx.get()); + csr_ref->copy_from(this->mtx.get()); csr_ref->move_to(sellp_mtx.get()); - assert_equal_to_mtx(sellp_mtx.get()); + this->assert_equal_to_mtx(sellp_mtx.get()); } -TEST_F(Csr, 
ConvertsToSparsityCsr) +TYPED_TEST(Csr, ConvertsToSparsityCsr) { - auto sparsity_mtx = gko::matrix::SparsityCsr<>::create(mtx->get_executor()); + using SparsityCsr = typename TestFixture::SparsityCsr; + auto sparsity_mtx = SparsityCsr::create(this->mtx->get_executor()); - mtx->convert_to(sparsity_mtx.get()); + this->mtx->convert_to(sparsity_mtx.get()); - assert_equal_to_mtx(sparsity_mtx.get()); + this->assert_equal_to_mtx(sparsity_mtx.get()); } -TEST_F(Csr, MovesToSparsityCsr) +TYPED_TEST(Csr, MovesToSparsityCsr) { - auto sparsity_mtx = gko::matrix::SparsityCsr<>::create(mtx->get_executor()); - auto csr_ref = gko::matrix::Csr<>::create(mtx->get_executor()); + using SparsityCsr = typename TestFixture::SparsityCsr; + using Csr = typename TestFixture::Mtx; + auto sparsity_mtx = SparsityCsr::create(this->mtx->get_executor()); + auto csr_ref = Csr::create(this->mtx->get_executor()); - csr_ref->copy_from(mtx.get()); + csr_ref->copy_from(this->mtx.get()); csr_ref->move_to(sparsity_mtx.get()); - assert_equal_to_mtx(sparsity_mtx.get()); + this->assert_equal_to_mtx(sparsity_mtx.get()); } -TEST_F(Csr, ConvertsToHybridAutomatically) +TYPED_TEST(Csr, ConvertsToHybridAutomatically) { - auto hybrid_mtx = gko::matrix::Hybrid<>::create(mtx->get_executor()); + using Hybrid = typename TestFixture::Hybrid; + auto hybrid_mtx = Hybrid::create(this->mtx->get_executor()); - mtx->convert_to(hybrid_mtx.get()); + this->mtx->convert_to(hybrid_mtx.get()); - assert_equal_to_mtx(hybrid_mtx.get()); + this->assert_equal_to_mtx(hybrid_mtx.get()); } -TEST_F(Csr, MovesToHybridAutomatically) +TYPED_TEST(Csr, MovesToHybridAutomatically) { - auto hybrid_mtx = gko::matrix::Hybrid<>::create(mtx->get_executor()); - auto csr_ref = gko::matrix::Csr<>::create(mtx->get_executor()); + using Hybrid = typename TestFixture::Hybrid; + using Csr = typename TestFixture::Mtx; + auto hybrid_mtx = Hybrid::create(this->mtx->get_executor()); + auto csr_ref = Csr::create(this->mtx->get_executor()); - 
csr_ref->copy_from(mtx.get()); + csr_ref->copy_from(this->mtx.get()); csr_ref->move_to(hybrid_mtx.get()); - assert_equal_to_mtx(hybrid_mtx.get()); + this->assert_equal_to_mtx(hybrid_mtx.get()); } -TEST_F(Csr, ConvertsToHybridByColumn2) +TYPED_TEST(Csr, ConvertsToHybridByColumn2) { - auto hybrid_mtx = gko::matrix::Hybrid<>::create( - mtx2->get_executor(), - std::make_shared::column_limit>(2)); + using Hybrid = typename TestFixture::Hybrid; + auto hybrid_mtx = + Hybrid::create(this->mtx2->get_executor(), + std::make_shared(2)); - mtx2->convert_to(hybrid_mtx.get()); + this->mtx2->convert_to(hybrid_mtx.get()); - assert_equal_to_mtx2(hybrid_mtx.get()); + this->assert_equal_to_mtx2(hybrid_mtx.get()); } -TEST_F(Csr, MovesToHybridByColumn2) +TYPED_TEST(Csr, MovesToHybridByColumn2) { - auto hybrid_mtx = gko::matrix::Hybrid<>::create( - mtx2->get_executor(), - std::make_shared::column_limit>(2)); - auto csr_ref = gko::matrix::Csr<>::create(mtx2->get_executor()); - - csr_ref->copy_from(mtx2.get()); + using Hybrid = typename TestFixture::Hybrid; + using Csr = typename TestFixture::Mtx; + auto hybrid_mtx = + Hybrid::create(this->mtx2->get_executor(), + std::make_shared(2)); + auto csr_ref = Csr::create(this->mtx2->get_executor()); + + csr_ref->copy_from(this->mtx2.get()); csr_ref->move_to(hybrid_mtx.get()); - assert_equal_to_mtx2(hybrid_mtx.get()); + this->assert_equal_to_mtx2(hybrid_mtx.get()); +} + + +TYPED_TEST(Csr, ConvertsEmptyToPrecision) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using OtherType = typename gko::next_precision; + using Csr = typename TestFixture::Mtx; + using OtherCsr = gko::matrix::Csr; + auto empty = OtherCsr::create(this->exec); + empty->get_row_ptrs()[0] = 0; + auto res = Csr::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_row_ptrs(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Csr, 
MovesEmptyToPrecision) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using OtherType = typename gko::next_precision; + using Csr = typename TestFixture::Mtx; + using OtherCsr = gko::matrix::Csr; + auto empty = OtherCsr::create(this->exec); + empty->get_row_ptrs()[0] = 0; + auto res = Csr::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_row_ptrs(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Csr, ConvertsEmptyToDense) +{ + using ValueType = typename TestFixture::value_type; + using Csr = typename TestFixture::Mtx; + using Dense = gko::matrix::Dense; + auto empty = Csr::create(this->exec); + auto res = Dense::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Csr, MovesEmptyToDense) +{ + using ValueType = typename TestFixture::value_type; + using Csr = typename TestFixture::Mtx; + using Dense = gko::matrix::Dense; + auto empty = Csr::create(this->exec); + auto res = Dense::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Csr, ConvertsEmptyToCoo) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Csr = typename TestFixture::Mtx; + using Coo = gko::matrix::Coo; + auto empty = Csr::create(this->exec); + auto res = Coo::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Csr, MovesEmptyToCoo) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Csr = typename TestFixture::Mtx; + using Coo = gko::matrix::Coo; + auto empty = Csr::create(this->exec); + auto res = Coo::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + 
ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Csr, ConvertsEmptyToEll) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Csr = typename TestFixture::Mtx; + using Ell = gko::matrix::Ell; + auto empty = Csr::create(this->exec); + auto res = Ell::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Csr, MovesEmptyToEll) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Csr = typename TestFixture::Mtx; + using Ell = gko::matrix::Ell; + auto empty = Csr::create(this->exec); + auto res = Ell::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Csr, ConvertsEmptyToSellp) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Csr = typename TestFixture::Mtx; + using Sellp = gko::matrix::Sellp; + auto empty = Csr::create(this->exec); + auto res = Sellp::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_slice_sets(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Csr, MovesEmptyToSellp) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Csr = typename TestFixture::Mtx; + using Sellp = gko::matrix::Sellp; + auto empty = Csr::create(this->exec); + auto res = Sellp::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_slice_sets(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Csr, ConvertsEmptyToSparsityCsr) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Csr = 
typename TestFixture::Mtx; + using SparsityCsr = gko::matrix::SparsityCsr; + auto empty = Csr::create(this->exec); + empty->get_row_ptrs()[0] = 0; + auto res = SparsityCsr::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_EQ(res->get_num_nonzeros(), 0); + ASSERT_EQ(*res->get_const_row_ptrs(), 0); +} + + +TYPED_TEST(Csr, MovesEmptyToSparsityCsr) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Csr = typename TestFixture::Mtx; + using SparsityCsr = gko::matrix::SparsityCsr; + auto empty = Csr::create(this->exec); + empty->get_row_ptrs()[0] = 0; + auto res = SparsityCsr::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_EQ(res->get_num_nonzeros(), 0); + ASSERT_EQ(*res->get_const_row_ptrs(), 0); +} + + +TYPED_TEST(Csr, ConvertsEmptyToHybrid) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Csr = typename TestFixture::Mtx; + using Hybrid = gko::matrix::Hybrid; + auto empty = Csr::create(this->exec); + auto res = Hybrid::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Csr, MovesEmptyToHybrid) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Csr = typename TestFixture::Mtx; + using Hybrid = gko::matrix::Hybrid; + auto empty = Csr::create(this->exec); + auto res = Hybrid::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); } -TEST_F(Csr, CalculatesNonzerosPerRow) +TYPED_TEST(Csr, CalculatesNonzerosPerRow) { - gko::Array row_nnz(exec, mtx->get_size()[0]); + gko::Array row_nnz(this->exec, this->mtx->get_size()[0]); - gko::kernels::reference::csr::calculate_nonzeros_per_row(exec, mtx.get(), - &row_nnz); + 
gko::kernels::reference::csr::calculate_nonzeros_per_row( + this->exec, this->mtx.get(), &row_nnz); auto row_nnz_val = row_nnz.get_data(); ASSERT_EQ(row_nnz_val[0], 3); @@ -575,54 +989,59 @@ TEST_F(Csr, CalculatesNonzerosPerRow) } -TEST_F(Csr, CalculatesTotalCols) +TYPED_TEST(Csr, CalculatesTotalCols) { gko::size_type total_cols; gko::size_type stride_factor = gko::matrix::default_stride_factor; gko::size_type slice_size = gko::matrix::default_slice_size; gko::kernels::reference::csr::calculate_total_cols( - exec, mtx.get(), &total_cols, stride_factor, slice_size); + this->exec, this->mtx.get(), &total_cols, stride_factor, slice_size); ASSERT_EQ(total_cols, 3); } -TEST_F(Csr, ConvertsToEll) +TYPED_TEST(Csr, ConvertsToEll) { - auto ell_mtx = gko::matrix::Ell<>::create(mtx->get_executor()); - auto dense_mtx = gko::matrix::Dense<>::create(mtx->get_executor()); - auto ref_dense_mtx = gko::matrix::Dense<>::create(mtx->get_executor()); + using Ell = typename TestFixture::Ell; + using Dense = typename TestFixture::Vec; + auto ell_mtx = Ell::create(this->mtx->get_executor()); + auto dense_mtx = Dense::create(this->mtx->get_executor()); + auto ref_dense_mtx = Dense::create(this->mtx->get_executor()); - mtx->convert_to(ell_mtx.get()); + this->mtx->convert_to(ell_mtx.get()); - assert_equal_to_mtx(ell_mtx.get()); + this->assert_equal_to_mtx(ell_mtx.get()); } -TEST_F(Csr, MovesToEll) +TYPED_TEST(Csr, MovesToEll) { - auto ell_mtx = gko::matrix::Ell<>::create(mtx->get_executor()); - auto dense_mtx = gko::matrix::Dense<>::create(mtx->get_executor()); - auto ref_dense_mtx = gko::matrix::Dense<>::create(mtx->get_executor()); + using Ell = typename TestFixture::Ell; + using Dense = typename TestFixture::Vec; + auto ell_mtx = Ell::create(this->mtx->get_executor()); + auto dense_mtx = Dense::create(this->mtx->get_executor()); + auto ref_dense_mtx = Dense::create(this->mtx->get_executor()); - mtx->move_to(ell_mtx.get()); + this->mtx->move_to(ell_mtx.get()); - 
assert_equal_to_mtx(ell_mtx.get()); + this->assert_equal_to_mtx(ell_mtx.get()); } -TEST_F(Csr, SquareMtxIsTransposable) +TYPED_TEST(Csr, SquareMtxIsTransposable) { + using Csr = typename TestFixture::Mtx; // clang-format off - auto mtx2 = gko::initialize>( + auto mtx2 = gko::initialize( {{1.0, 3.0, 2.0}, {0.0, 5.0, 0.0}, - {0.0, 1.5, 2.0}}, exec); + {0.0, 1.5, 2.0}}, this->exec); // clang-format on auto trans = mtx2->transpose(); - auto trans_as_csr = static_cast *>(trans.get()); + auto trans_as_csr = static_cast(trans.get()); // clang-format off GKO_ASSERT_MTX_NEAR(trans_as_csr, @@ -633,10 +1052,11 @@ TEST_F(Csr, SquareMtxIsTransposable) } -TEST_F(Csr, NonSquareMtxIsTransposable) +TYPED_TEST(Csr, NonSquareMtxIsTransposable) { - auto trans = mtx->transpose(); - auto trans_as_csr = static_cast *>(trans.get()); + using Csr = typename TestFixture::Mtx; + auto trans = this->mtx->transpose(); + auto trans_as_csr = static_cast(trans.get()); // clang-format off GKO_ASSERT_MTX_NEAR(trans_as_csr, @@ -647,59 +1067,266 @@ TEST_F(Csr, NonSquareMtxIsTransposable) } -TEST_F(Csr, MtxIsConjugateTransposable) +TYPED_TEST(Csr, SquareMatrixIsRowPermutable) { + using Csr = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; // clang-format off - auto mtx2 = gko::initialize>>( - {{1.0 + 2.0 * i, 3.0 + 0.0 * i, 2.0 + 0.0 * i}, - {0.0 + 0.0 * i, 5.0 - 3.5 * i, 0.0 + 0.0 * i}, - {0.0 + 0.0 * i, 0.0 + 1.5 * i, 2.0 + 0.0 * i}}, exec); + auto p_mtx = gko::initialize({{1.0, 3.0, 2.0}, + {0.0, 5.0, 0.0}, + {0.0, 1.5, 2.0}}, this->exec); // clang-format on + gko::Array permute_idxs{this->exec, {1, 2, 0}}; - auto trans = mtx2->conj_transpose(); - auto trans_as_csr = - static_cast> *>(trans.get()); + auto row_permute = p_mtx->row_permute(&permute_idxs); + auto row_permute_csr = static_cast(row_permute.get()); // clang-format off - GKO_ASSERT_MTX_NEAR(trans_as_csr, - l({{1.0 - 2.0 * i, 0.0 + 0.0 * i, 0.0 + 0.0 * i}, - {3.0 + 0.0 * i, 5.0 + 3.5 * i, 0.0 - 1.5 * i}, - 
{2.0 + 0.0 * i, 0.0 + 0.0 * i, 2.0 + 0.0 * i}}), 0.0); + GKO_ASSERT_MTX_NEAR(row_permute_csr, + l({{0.0, 5.0, 0.0}, + {0.0, 1.5, 2.0}, + {1.0, 3.0, 2.0}}), + 0.0); + // clang-format on +} + + +TYPED_TEST(Csr, NonSquareMatrixIsRowPermutable) +{ + using Csr = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + // clang-format off + auto p_mtx = gko::initialize({{1.0, 3.0, 2.0}, + {0.0, 5.0, 0.0}}, this->exec); + // clang-format on + gko::Array permute_idxs{this->exec, {1, 0}}; + + auto row_permute = p_mtx->row_permute(&permute_idxs); + + auto row_permute_csr = static_cast(row_permute.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(row_permute_csr, + l({{0.0, 5.0, 0.0}, + {1.0, 3.0, 2.0}}), + 0.0); + // clang-format on +} + + +TYPED_TEST(Csr, SquareMatrixIsColPermutable) +{ + using Csr = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + // clang-format off + auto p_mtx = gko::initialize({{1.0, 3.0, 2.0}, + {0.0, 5.0, 0.0}, + {0.0, 1.5, 2.0}}, this->exec); + // clang-format on + gko::Array permute_idxs{this->exec, {1, 2, 0}}; + + auto c_permute = p_mtx->column_permute(&permute_idxs); + + auto c_permute_csr = static_cast(c_permute.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(c_permute_csr, + l({{3.0, 2.0, 1.0}, + {5.0, 0.0, 0.0}, + {1.5, 2.0, 0.0}}), + 0.0); + // clang-format on +} + + +TYPED_TEST(Csr, NonSquareMatrixIsColPermutable) +{ + using Csr = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + // clang-format off + auto p_mtx = gko::initialize({{1.0, 0.0, 2.0}, + {0.0, 5.0, 0.0}}, this->exec); + // clang-format on + gko::Array permute_idxs{this->exec, {1, 2, 0}}; + + auto c_permute = p_mtx->column_permute(&permute_idxs); + + auto c_permute_csr = static_cast(c_permute.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(c_permute_csr, + l({{0.0, 2.0, 1.0}, + {5.0, 0.0, 0.0}}), + 0.0); + // clang-format on +} + + +TYPED_TEST(Csr, 
SquareMatrixIsInverseRowPermutable) +{ + using Csr = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + // clang-format off + auto inverse_p_mtx = gko::initialize({{1.0, 3.0, 2.0}, + {0.0, 5.0, 0.0}, + {0.0, 1.5, 2.0}}, this->exec); + // clang-format on + gko::Array inverse_permute_idxs{this->exec, {1, 2, 0}}; + + auto inverse_row_permute = + inverse_p_mtx->inverse_row_permute(&inverse_permute_idxs); + + auto inverse_row_permute_csr = + static_cast(inverse_row_permute.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(inverse_row_permute_csr, + l({{0.0, 1.5, 2.0}, + {1.0, 3.0, 2.0}, + {0.0, 5.0, 0.0}}), + 0.0); + // clang-format on +} + + +TYPED_TEST(Csr, NonSquareMatrixIsInverseRowPermutable) +{ + using Csr = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + // clang-format off + auto inverse_p_mtx = gko::initialize({{1.0, 3.0, 2.0}, + {0.0, 5.0, 0.0}}, this->exec); + // clang-format on + gko::Array inverse_permute_idxs{this->exec, {1, 0}}; + + auto inverse_row_permute = + inverse_p_mtx->inverse_row_permute(&inverse_permute_idxs); + + auto inverse_row_permute_csr = + static_cast(inverse_row_permute.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(inverse_row_permute_csr, + l({{0.0, 5.0, 0.0}, + {1.0, 3.0, 2.0}}), + 0.0); + // clang-format on +} + + +TYPED_TEST(Csr, SquareMatrixIsInverseColPermutable) +{ + using Csr = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + // clang-format off + auto inverse_p_mtx = gko::initialize({{1.0, 3.0, 2.0}, + {0.0, 5.0, 0.0}, + {0.0, 1.5, 2.0}}, this->exec); + // clang-format on + gko::Array inverse_permute_idxs{this->exec, {1, 2, 0}}; + + auto inverse_c_permute = + inverse_p_mtx->inverse_column_permute(&inverse_permute_idxs); + + auto inverse_c_permute_csr = static_cast(inverse_c_permute.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(inverse_c_permute_csr, + l({{2.0, 1.0, 3.0}, + {0.0, 0.0, 5.0}, + {2.0, 0.0, 1.5}}), + 
0.0); + // clang-format on +} + + +TYPED_TEST(Csr, NonSquareMatrixIsInverseColPermutable) +{ + using Csr = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + // clang-format off + auto inverse_p_mtx = gko::initialize({{1.0, 3.0, 2.0}, + {0.0, 5.0, 0.0}}, this->exec); + // clang-format on + gko::Array inverse_permute_idxs{this->exec, {1, 2, 0}}; + + auto inverse_c_permute = + inverse_p_mtx->inverse_column_permute(&inverse_permute_idxs); + + auto inverse_c_permute_csr = static_cast(inverse_c_permute.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(inverse_c_permute_csr, + l({{2.0, 1.0, 3.0}, + {0.0, 0.0, 5.0}}), + 0.0); // clang-format on } -TEST_F(Csr, RecognizeSortedMatrix) +TYPED_TEST(Csr, RecognizeSortedMatrix) { - ASSERT_TRUE(mtx->is_sorted_by_column_index()); - ASSERT_TRUE(mtx2->is_sorted_by_column_index()); - ASSERT_TRUE(mtx3_sorted->is_sorted_by_column_index()); + ASSERT_TRUE(this->mtx->is_sorted_by_column_index()); + ASSERT_TRUE(this->mtx2->is_sorted_by_column_index()); + ASSERT_TRUE(this->mtx3_sorted->is_sorted_by_column_index()); } -TEST_F(Csr, RecognizeUnsortedMatrix) +TYPED_TEST(Csr, RecognizeUnsortedMatrix) { - ASSERT_FALSE(mtx3_unsorted->is_sorted_by_column_index()); + ASSERT_FALSE(this->mtx3_unsorted->is_sorted_by_column_index()); } -TEST_F(Csr, SortSortedMatrix) +TYPED_TEST(Csr, SortSortedMatrix) { - auto matrix = mtx3_sorted->clone(); + auto matrix = this->mtx3_sorted->clone(); matrix->sort_by_column_index(); - GKO_ASSERT_MTX_NEAR(matrix, mtx3_sorted, 0.0); + GKO_ASSERT_MTX_NEAR(matrix, this->mtx3_sorted, 0.0); } -TEST_F(Csr, SortUnsortedMatrix) +TYPED_TEST(Csr, SortUnsortedMatrix) { - auto matrix = mtx3_unsorted->clone(); + auto matrix = this->mtx3_unsorted->clone(); matrix->sort_by_column_index(); - GKO_ASSERT_MTX_NEAR(matrix, mtx3_sorted, 0.0); + GKO_ASSERT_MTX_NEAR(matrix, this->mtx3_sorted, 0.0); +} + + +template +class CsrComplex : public ::testing::Test { +protected: + using value_type = + typename 
std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Mtx = gko::matrix::Csr; +}; + +TYPED_TEST_CASE(CsrComplex, gko::test::ComplexValueIndexTypes); + + +TYPED_TEST(CsrComplex, MtxIsConjugateTransposable) +{ + using Csr = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using value_type = typename TestFixture::value_type; + + auto exec = gko::ReferenceExecutor::create(); + // clang-format off + auto mtx2 = gko::initialize( + {{T{1.0, 2.0}, T{3.0, 0.0}, T{2.0, 0.0}}, + {T{0.0, 0.0}, T{5.0, - 3.5}, T{0.0,0.0}}, + {T{0.0, 0.0}, T{0.0, 1.5}, T{2.0,0.0}}}, exec); + // clang-format on + + auto trans = mtx2->conj_transpose(); + auto trans_as_csr = static_cast(trans.get()); + + // clang-format off + GKO_ASSERT_MTX_NEAR(trans_as_csr, + l({{T{1.0, - 2.0}, T{0.0, 0.0}, T{0.0, 0.0}}, + {T{3.0, 0.0}, T{5.0, 3.5}, T{0.0, - 1.5}}, + {T{2.0, 0.0}, T{0.0, 0.0}, T{2.0 + 0.0}}}), 0.0); + // clang-format on } diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index ce24a8ef42a..e851d9d0dfd 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/dense_kernels.hpp" +#include #include @@ -43,54 +43,51 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include #include #include -#include #include #include #include #include +#include "core/matrix/dense_kernels.hpp" #include "core/test/utils.hpp" namespace { +template class Dense : public ::testing::Test { protected: - using Mtx = gko::matrix::Dense<>; + using value_type = T; + using Mtx = gko::matrix::Dense; Dense() : exec(gko::ReferenceExecutor::create()), mtx1(gko::initialize(4, {{1.0, 2.0, 3.0}, {1.5, 2.5, 3.5}}, exec)), - mtx2(gko::initialize({{1.0, -1.0}, {-2.0, 2.0}}, exec)), + mtx2(gko::initialize({I({1.0, -1.0}), I({-2.0, 2.0})}, + exec)), mtx3(gko::initialize(4, {{1.0, 2.0, 3.0}, {0.5, 1.5, 2.5}}, exec)), mtx4(gko::initialize(4, {{1.0, 3.0, 2.0}, {0.0, 5.0, 0.0}}, exec)), mtx5(gko::initialize( {{1.0, -1.0, -0.5}, {-2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}}, exec)), - mtx6(gko::initialize>>( - {{1.0 + 2.0 * i, -1.0 + 2.1 * i}, - {-2.0 + 1.5 * i, 4.5 + 0.0 * i}, - {1.0 + 0.0 * i, i}}, - exec)), - mtx7(gko::initialize({{1.0, 2.0, 0.0}, {0.0, 1.5, 0.0}}, exec)), - mtx8(gko::initialize({{1.0, 2.0, 3.0}, {0.0, 1.5, 0.0}}, exec)) + mtx6(gko::initialize({{1.0, 2.0, 0.0}, {0.0, 1.5, 0.0}}, exec)), + mtx7(gko::initialize({{1.0, 2.0, 3.0}, {0.0, 1.5, 0.0}}, exec)) {} - std::complex i{0, 1}; std::shared_ptr exec; - std::unique_ptr> mtx1; - std::unique_ptr> mtx2; - std::unique_ptr> mtx3; - std::unique_ptr> mtx4; - std::unique_ptr> mtx5; - std::unique_ptr>> mtx6; - std::unique_ptr> mtx7; - std::unique_ptr> mtx8; + std::unique_ptr mtx1; + std::unique_ptr mtx2; + std::unique_ptr mtx3; + std::unique_ptr mtx4; + std::unique_ptr mtx5; + std::unique_ptr mtx6; + std::unique_ptr mtx7; std::ranlux48 rand_engine; @@ -99,192 +96,324 @@ class Dense : public ::testing::Test { { return gko::test::generate_random_matrix( num_rows, num_cols, - std::uniform_int_distribution<>(num_cols, num_cols), - std::normal_distribution<>(0.0, 1.0), rand_engine, exec); + std::uniform_int_distribution(num_cols, num_cols), + std::normal_distribution>(0.0, 1.0), + rand_engine, exec); } }; 
-TEST_F(Dense, AppliesToDense) -{ - mtx2->apply(mtx1.get(), mtx3.get()); +TYPED_TEST_CASE(Dense, gko::test::ValueTypes); + - EXPECT_EQ(mtx3->at(0, 0), -0.5); - EXPECT_EQ(mtx3->at(0, 1), -0.5); - EXPECT_EQ(mtx3->at(0, 2), -0.5); - EXPECT_EQ(mtx3->at(1, 0), 1.0); - EXPECT_EQ(mtx3->at(1, 1), 1.0); - ASSERT_EQ(mtx3->at(1, 2), 1.0); +TYPED_TEST(Dense, AppliesToDense) +{ + using T = typename TestFixture::value_type; + this->mtx2->apply(this->mtx1.get(), this->mtx3.get()); + + EXPECT_EQ(this->mtx3->at(0, 0), T{-0.5}); + EXPECT_EQ(this->mtx3->at(0, 1), T{-0.5}); + EXPECT_EQ(this->mtx3->at(0, 2), T{-0.5}); + EXPECT_EQ(this->mtx3->at(1, 0), T{1.0}); + EXPECT_EQ(this->mtx3->at(1, 1), T{1.0}); + ASSERT_EQ(this->mtx3->at(1, 2), T{1.0}); } -TEST_F(Dense, AppliesLinearCombinationToDense) +TYPED_TEST(Dense, AppliesLinearCombinationToDense) { - auto alpha = gko::initialize({-1.0}, exec); - auto beta = gko::initialize({2.0}, exec); + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + + this->mtx2->apply(alpha.get(), this->mtx1.get(), beta.get(), + this->mtx3.get()); + + EXPECT_EQ(this->mtx3->at(0, 0), T{2.5}); + EXPECT_EQ(this->mtx3->at(0, 1), T{4.5}); + EXPECT_EQ(this->mtx3->at(0, 2), T{6.5}); + EXPECT_EQ(this->mtx3->at(1, 0), T{0.0}); + EXPECT_EQ(this->mtx3->at(1, 1), T{2.0}); + ASSERT_EQ(this->mtx3->at(1, 2), T{4.0}); +} + - mtx2->apply(alpha.get(), mtx1.get(), beta.get(), mtx3.get()); +TYPED_TEST(Dense, ApplyFailsOnWrongInnerDimension) +{ + using Mtx = typename TestFixture::Mtx; + auto res = Mtx::create(this->exec, gko::dim<2>{2}); - EXPECT_EQ(mtx3->at(0, 0), 2.5); - EXPECT_EQ(mtx3->at(0, 1), 4.5); - EXPECT_EQ(mtx3->at(0, 2), 6.5); - EXPECT_EQ(mtx3->at(1, 0), 0.0); - EXPECT_EQ(mtx3->at(1, 1), 2.0); - ASSERT_EQ(mtx3->at(1, 2), 4.0); + ASSERT_THROW(this->mtx2->apply(this->mtx1.get(), res.get()), + gko::DimensionMismatch); } -TEST_F(Dense, 
ApplyFailsOnWrongInnerDimension) +TYPED_TEST(Dense, ApplyFailsOnWrongNumberOfRows) { - auto res = gko::matrix::Dense<>::create(exec, gko::dim<2>{2}); + using Mtx = typename TestFixture::Mtx; + auto res = Mtx::create(this->exec, gko::dim<2>{3}); - ASSERT_THROW(mtx2->apply(mtx1.get(), res.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx1->apply(this->mtx2.get(), res.get()), + gko::DimensionMismatch); } -TEST_F(Dense, ApplyFailsOnWrongNumberOfRows) +TYPED_TEST(Dense, ApplyFailsOnWrongNumberOfCols) { - auto res = gko::matrix::Dense<>::create(exec, gko::dim<2>{3}); + using Mtx = typename TestFixture::Mtx; + auto res = Mtx::create(this->exec, gko::dim<2>{2}, 3); - ASSERT_THROW(mtx1->apply(mtx2.get(), res.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx1->apply(this->mtx2.get(), res.get()), + gko::DimensionMismatch); } -TEST_F(Dense, ApplyFailsOnWrongNumberOfCols) +TYPED_TEST(Dense, ScalesData) { - auto res = gko::matrix::Dense<>::create(exec, gko::dim<2>{2}, 3); + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto alpha = gko::initialize({I{2.0, -2.0}}, this->exec); + + this->mtx2->scale(alpha.get()); - ASSERT_THROW(mtx1->apply(mtx2.get(), res.get()), gko::DimensionMismatch); + EXPECT_EQ(this->mtx2->at(0, 0), T{2.0}); + EXPECT_EQ(this->mtx2->at(0, 1), T{2.0}); + EXPECT_EQ(this->mtx2->at(1, 0), T{-4.0}); + EXPECT_EQ(this->mtx2->at(1, 1), T{-4.0}); } -TEST_F(Dense, ScalesData) +TYPED_TEST(Dense, ScalesDataWithScalar) { - auto alpha = gko::initialize({{2.0, -2.0}}, exec); + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto alpha = gko::initialize({2.0}, this->exec); - mtx2->scale(alpha.get()); + this->mtx2->scale(alpha.get()); - EXPECT_EQ(mtx2->at(0, 0), 2.0); - EXPECT_EQ(mtx2->at(0, 1), 2.0); - EXPECT_EQ(mtx2->at(1, 0), -4.0); - EXPECT_EQ(mtx2->at(1, 1), -4.0); + EXPECT_EQ(this->mtx2->at(0, 0), T{2.0}); + EXPECT_EQ(this->mtx2->at(0, 1), T{-2.0}); + 
EXPECT_EQ(this->mtx2->at(1, 0), T{-4.0}); + EXPECT_EQ(this->mtx2->at(1, 1), T{4.0}); } -TEST_F(Dense, ScalesDataWithScalar) +TYPED_TEST(Dense, ScalesDataWithStride) { - auto alpha = gko::initialize({2.0}, exec); + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto alpha = gko::initialize({{-1.0, 1.0, 2.0}}, this->exec); + + this->mtx1->scale(alpha.get()); + + EXPECT_EQ(this->mtx1->at(0, 0), T{-1.0}); + EXPECT_EQ(this->mtx1->at(0, 1), T{2.0}); + EXPECT_EQ(this->mtx1->at(0, 2), T{6.0}); + EXPECT_EQ(this->mtx1->at(1, 0), T{-1.5}); + EXPECT_EQ(this->mtx1->at(1, 1), T{2.5}); + ASSERT_EQ(this->mtx1->at(1, 2), T{7.0}); +} - mtx2->scale(alpha.get()); - EXPECT_EQ(mtx2->at(0, 0), 2.0); - EXPECT_EQ(mtx2->at(0, 1), -2.0); - EXPECT_EQ(mtx2->at(1, 0), -4.0); - EXPECT_EQ(mtx2->at(1, 1), 4.0); +TYPED_TEST(Dense, AddsScaled) +{ + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto alpha = gko::initialize({{2.0, 1.0, -2.0}}, this->exec); + + this->mtx1->add_scaled(alpha.get(), this->mtx3.get()); + + EXPECT_EQ(this->mtx1->at(0, 0), T{3.0}); + EXPECT_EQ(this->mtx1->at(0, 1), T{4.0}); + EXPECT_EQ(this->mtx1->at(0, 2), T{-3.0}); + EXPECT_EQ(this->mtx1->at(1, 0), T{2.5}); + EXPECT_EQ(this->mtx1->at(1, 1), T{4.0}); + ASSERT_EQ(this->mtx1->at(1, 2), T{-1.5}); } -TEST_F(Dense, ScalesDataWithStride) +TYPED_TEST(Dense, AddsScaledWithScalar) { - auto alpha = gko::initialize({{-1.0, 1.0, 2.0}}, exec); + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto alpha = gko::initialize({2.0}, this->exec); + + this->mtx1->add_scaled(alpha.get(), this->mtx3.get()); + + EXPECT_EQ(this->mtx1->at(0, 0), T{3.0}); + EXPECT_EQ(this->mtx1->at(0, 1), T{6.0}); + EXPECT_EQ(this->mtx1->at(0, 2), T{9.0}); + EXPECT_EQ(this->mtx1->at(1, 0), T{2.5}); + EXPECT_EQ(this->mtx1->at(1, 1), T{5.5}); + ASSERT_EQ(this->mtx1->at(1, 2), T{8.5}); +} + - mtx1->scale(alpha.get()); +TYPED_TEST(Dense, 
AddScaledFailsOnWrongSizes) +{ + using Mtx = typename TestFixture::Mtx; + auto alpha = Mtx::create(this->exec, gko::dim<2>{1, 2}); - EXPECT_EQ(mtx1->at(0, 0), -1.0); - EXPECT_EQ(mtx1->at(0, 1), 2.0); - EXPECT_EQ(mtx1->at(0, 2), 6.0); - EXPECT_EQ(mtx1->at(1, 0), -1.5); - EXPECT_EQ(mtx1->at(1, 1), 2.5); - ASSERT_EQ(mtx1->at(1, 2), 7.0); + ASSERT_THROW(this->mtx1->add_scaled(alpha.get(), this->mtx2.get()), + gko::DimensionMismatch); } -TEST_F(Dense, AddsScaled) +TYPED_TEST(Dense, ComputesDot) { - auto alpha = gko::initialize({{2.0, 1.0, -2.0}}, exec); + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto result = Mtx::create(this->exec, gko::dim<2>{1, 3}); - mtx1->add_scaled(alpha.get(), mtx3.get()); + this->mtx1->compute_dot(this->mtx3.get(), result.get()); - EXPECT_EQ(mtx1->at(0, 0), 3.0); - EXPECT_EQ(mtx1->at(0, 1), 4.0); - EXPECT_EQ(mtx1->at(0, 2), -3.0); - EXPECT_EQ(mtx1->at(1, 0), 2.5); - EXPECT_EQ(mtx1->at(1, 1), 4.0); - ASSERT_EQ(mtx1->at(1, 2), -1.5); + EXPECT_EQ(result->at(0, 0), T{1.75}); + EXPECT_EQ(result->at(0, 1), T{7.75}); + ASSERT_EQ(result->at(0, 2), T{17.75}); } -TEST_F(Dense, AddsScaledWithScalar) +TYPED_TEST(Dense, ComputesNorm2) { - auto alpha = gko::initialize({2.0}, exec); + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using T_nc = gko::remove_complex; + using NormVector = gko::matrix::Dense; + auto mtx(gko::initialize( + {I{1.0, 0.0}, I{2.0, 3.0}, I{2.0, 4.0}}, this->exec)); + auto result = NormVector::create(this->exec, gko::dim<2>{1, 2}); - mtx1->add_scaled(alpha.get(), mtx3.get()); + mtx->compute_norm2(result.get()); - EXPECT_EQ(mtx1->at(0, 0), 3.0); - EXPECT_EQ(mtx1->at(0, 1), 6.0); - EXPECT_EQ(mtx1->at(0, 2), 9.0); - EXPECT_EQ(mtx1->at(1, 0), 2.5); - EXPECT_EQ(mtx1->at(1, 1), 5.5); - ASSERT_EQ(mtx1->at(1, 2), 8.5); + EXPECT_EQ(result->at(0, 0), T_nc{3.0}); + EXPECT_EQ(result->at(0, 1), T_nc{5.0}); } -TEST_F(Dense, AddScaledFailsOnWrongSizes) 
+TYPED_TEST(Dense, ComputDotFailsOnWrongInputSize) { - auto alpha = gko::matrix::Dense<>::create(exec, gko::dim<2>{1, 2}); + using Mtx = typename TestFixture::Mtx; + auto result = Mtx::create(this->exec, gko::dim<2>{1, 3}); - ASSERT_THROW(mtx1->add_scaled(alpha.get(), mtx2.get()), + ASSERT_THROW(this->mtx1->compute_dot(this->mtx2.get(), result.get()), gko::DimensionMismatch); } -TEST_F(Dense, ComputesDot) +TYPED_TEST(Dense, ComputDotFailsOnWrongResultSize) { - auto result = gko::matrix::Dense<>::create(exec, gko::dim<2>{1, 3}); - - mtx1->compute_dot(mtx3.get(), result.get()); + using Mtx = typename TestFixture::Mtx; + auto result = Mtx::create(this->exec, gko::dim<2>{1, 2}); - EXPECT_EQ(result->at(0, 0), 1.75); - EXPECT_EQ(result->at(0, 1), 7.75); - ASSERT_EQ(result->at(0, 2), 17.75); + ASSERT_THROW(this->mtx1->compute_dot(this->mtx3.get(), result.get()), + gko::DimensionMismatch); } -TEST_F(Dense, ComputesNorm2) +TYPED_TEST(Dense, ConvertsToPrecision) { - auto mtx(gko::initialize({{1.0, 0.0}, {2.0, 3.0}, {2.0, 4.0}}, exec)); - auto result = gko::matrix::Dense<>::create(exec, gko::dim<2>{1, 2}); + using Dense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using OtherT = typename gko::next_precision; + using OtherDense = typename gko::matrix::Dense; + auto tmp = OtherDense::create(this->exec); + auto res = Dense::create(this->exec); + // If OtherT is more precise: 0, otherwise r + auto residual = r::value < r::value + ? 
gko::remove_complex{0} + : gko::remove_complex{r::value}; + + this->mtx1->convert_to(tmp.get()); + tmp->convert_to(res.get()); + + GKO_ASSERT_MTX_NEAR(this->mtx1, res, residual); +} - mtx->compute_norm2(result.get()); - EXPECT_EQ(result->at(0, 0), 3.0); - EXPECT_EQ(result->at(0, 1), 5.0); +TYPED_TEST(Dense, MovesToPrecision) +{ + using Dense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using OtherT = typename gko::next_precision; + using OtherDense = typename gko::matrix::Dense; + auto tmp = OtherDense::create(this->exec); + auto res = Dense::create(this->exec); + // If OtherT is more precise: 0, otherwise r + auto residual = r::value < r::value + ? gko::remove_complex{0} + : gko::remove_complex{r::value}; + + this->mtx1->move_to(tmp.get()); + tmp->move_to(res.get()); + + GKO_ASSERT_MTX_NEAR(this->mtx1, res, residual); } -TEST_F(Dense, ComputDotFailsOnWrongInputSize) +TYPED_TEST(Dense, ConvertsToCoo32) { - auto result = gko::matrix::Dense<>::create(exec, gko::dim<2>{1, 3}); + using T = typename TestFixture::value_type; + using Coo = typename gko::matrix::Coo; + auto coo_mtx = Coo::create(this->mtx4->get_executor()); - ASSERT_THROW(mtx1->compute_dot(mtx2.get(), result.get()), - gko::DimensionMismatch); + this->mtx4->convert_to(coo_mtx.get()); + auto v = coo_mtx->get_const_values(); + auto c = coo_mtx->get_const_col_idxs(); + auto r = coo_mtx->get_const_row_idxs(); + + ASSERT_EQ(coo_mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(coo_mtx->get_num_stored_elements(), 4); + EXPECT_EQ(r[0], 0); + EXPECT_EQ(r[1], 0); + EXPECT_EQ(r[2], 0); + EXPECT_EQ(r[3], 1); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[2], 2); + EXPECT_EQ(c[3], 1); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{3.0}); + EXPECT_EQ(v[2], T{2.0}); + EXPECT_EQ(v[3], T{5.0}); } -TEST_F(Dense, ComputDotFailsOnWrongResultSize) +TYPED_TEST(Dense, MovesToCoo32) { - auto result = gko::matrix::Dense<>::create(exec, gko::dim<2>{1, 2}); + using T = typename 
TestFixture::value_type; + using Coo = typename gko::matrix::Coo; + auto coo_mtx = Coo::create(this->mtx4->get_executor()); - ASSERT_THROW(mtx1->compute_dot(mtx3.get(), result.get()), - gko::DimensionMismatch); + this->mtx4->move_to(coo_mtx.get()); + auto v = coo_mtx->get_const_values(); + auto c = coo_mtx->get_const_col_idxs(); + auto r = coo_mtx->get_const_row_idxs(); + + ASSERT_EQ(coo_mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(coo_mtx->get_num_stored_elements(), 4); + EXPECT_EQ(r[0], 0); + EXPECT_EQ(r[1], 0); + EXPECT_EQ(r[2], 0); + EXPECT_EQ(r[3], 1); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[2], 2); + EXPECT_EQ(c[3], 1); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{3.0}); + EXPECT_EQ(v[2], T{2.0}); + EXPECT_EQ(v[3], T{5.0}); } -TEST_F(Dense, ConvertsToCoo) +TYPED_TEST(Dense, ConvertsToCoo64) { - auto coo_mtx = gko::matrix::Coo<>::create(mtx4->get_executor()); + using T = typename TestFixture::value_type; + using Coo = typename gko::matrix::Coo; + auto coo_mtx = Coo::create(this->mtx4->get_executor()); - mtx4->convert_to(coo_mtx.get()); + this->mtx4->convert_to(coo_mtx.get()); auto v = coo_mtx->get_const_values(); auto c = coo_mtx->get_const_col_idxs(); auto r = coo_mtx->get_const_row_idxs(); @@ -299,18 +428,20 @@ TEST_F(Dense, ConvertsToCoo) EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 3.0); - EXPECT_EQ(v[2], 2.0); - EXPECT_EQ(v[3], 5.0); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{3.0}); + EXPECT_EQ(v[2], T{2.0}); + EXPECT_EQ(v[3], T{5.0}); } -TEST_F(Dense, MovesToCoo) +TYPED_TEST(Dense, MovesToCoo64) { - auto coo_mtx = gko::matrix::Coo<>::create(mtx4->get_executor()); + using T = typename TestFixture::value_type; + using Coo = typename gko::matrix::Coo; + auto coo_mtx = Coo::create(this->mtx4->get_executor()); - mtx4->move_to(coo_mtx.get()); + this->mtx4->move_to(coo_mtx.get()); auto v = coo_mtx->get_const_values(); auto c = coo_mtx->get_const_col_idxs(); auto r = 
coo_mtx->get_const_row_idxs(); @@ -325,50 +456,93 @@ TEST_F(Dense, MovesToCoo) EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 3.0); - EXPECT_EQ(v[2], 2.0); - EXPECT_EQ(v[3], 5.0); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{3.0}); + EXPECT_EQ(v[2], T{2.0}); + EXPECT_EQ(v[3], T{5.0}); } -TEST_F(Dense, ConvertsEmptyMatrixToCsr) +TYPED_TEST(Dense, ConvertsToCsr32) { - auto strategy = std::make_shared::load_balance>(0); - auto from_mtx = gko::matrix::Dense<>::create(exec, gko::dim<2>{0, 0}); - auto to_mtx = - gko::matrix::Csr<>::create(exec, gko::dim<2>{0, 0}, 0, strategy); + using T = typename TestFixture::value_type; + using Csr = typename gko::matrix::Csr; + auto csr_s_classical = std::make_shared(); + auto csr_s_merge = std::make_shared(); + auto csr_mtx_c = Csr::create(this->mtx4->get_executor(), csr_s_classical); + auto csr_mtx_m = Csr::create(this->mtx4->get_executor(), csr_s_merge); - from_mtx->convert_to(to_mtx.get()); + this->mtx4->convert_to(csr_mtx_c.get()); + this->mtx4->convert_to(csr_mtx_m.get()); - ASSERT_FALSE(to_mtx->get_size()); + auto v = csr_mtx_c->get_const_values(); + auto c = csr_mtx_c->get_const_col_idxs(); + auto r = csr_mtx_c->get_const_row_ptrs(); + ASSERT_EQ(csr_mtx_c->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(csr_mtx_c->get_num_stored_elements(), 4); + EXPECT_EQ(r[0], 0); + EXPECT_EQ(r[1], 3); + EXPECT_EQ(r[2], 4); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[2], 2); + EXPECT_EQ(c[3], 1); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{3.0}); + EXPECT_EQ(v[2], T{2.0}); + EXPECT_EQ(v[3], T{5.0}); + ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical"); + GKO_ASSERT_MTX_NEAR(csr_mtx_c.get(), csr_mtx_m.get(), 0.0); + ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path"); } -TEST_F(Dense, MovesEmptyMatrixToCsr) +TYPED_TEST(Dense, MovesToCsr32) { - auto strategy = std::make_shared::load_balance>(0); - auto from_mtx = gko::matrix::Dense<>::create(exec, 
gko::dim<2>{0, 0}); - auto to_mtx = - gko::matrix::Csr<>::create(exec, gko::dim<2>{0, 0}, 0, strategy); - - from_mtx->move_to(to_mtx.get()); + using T = typename TestFixture::value_type; + using Csr = typename gko::matrix::Csr; + auto csr_s_classical = std::make_shared(); + auto csr_s_merge = std::make_shared(); + auto csr_mtx_c = Csr::create(this->mtx4->get_executor(), csr_s_classical); + auto csr_mtx_m = Csr::create(this->mtx4->get_executor(), csr_s_merge); + auto mtx_clone = this->mtx4->clone(); + + this->mtx4->move_to(csr_mtx_c.get()); + mtx_clone->move_to(csr_mtx_m.get()); - ASSERT_FALSE(to_mtx->get_size()); + auto v = csr_mtx_c->get_const_values(); + auto c = csr_mtx_c->get_const_col_idxs(); + auto r = csr_mtx_c->get_const_row_ptrs(); + ASSERT_EQ(csr_mtx_c->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(csr_mtx_c->get_num_stored_elements(), 4); + EXPECT_EQ(r[0], 0); + EXPECT_EQ(r[1], 3); + EXPECT_EQ(r[2], 4); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[2], 2); + EXPECT_EQ(c[3], 1); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{3.0}); + EXPECT_EQ(v[2], T{2.0}); + EXPECT_EQ(v[3], T{5.0}); + ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical"); + GKO_ASSERT_MTX_NEAR(csr_mtx_c.get(), csr_mtx_m.get(), 0.0); + ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path"); } -TEST_F(Dense, ConvertsToCsr) +TYPED_TEST(Dense, ConvertsToCsr64) { - auto csr_s_classical = std::make_shared::classical>(); - auto csr_s_merge = std::make_shared::merge_path>(); - auto csr_mtx_c = - gko::matrix::Csr<>::create(mtx4->get_executor(), csr_s_classical); - auto csr_mtx_m = - gko::matrix::Csr<>::create(mtx4->get_executor(), csr_s_merge); + using T = typename TestFixture::value_type; + using Csr = typename gko::matrix::Csr; + auto csr_s_classical = std::make_shared(); + auto csr_s_merge = std::make_shared(); + auto csr_mtx_c = Csr::create(this->mtx4->get_executor(), csr_s_classical); + auto csr_mtx_m = Csr::create(this->mtx4->get_executor(), csr_s_merge); - 
mtx4->convert_to(csr_mtx_c.get()); - mtx4->convert_to(csr_mtx_m.get()); + this->mtx4->convert_to(csr_mtx_c.get()); + this->mtx4->convert_to(csr_mtx_m.get()); auto v = csr_mtx_c->get_const_values(); auto c = csr_mtx_c->get_const_col_idxs(); @@ -382,27 +556,27 @@ TEST_F(Dense, ConvertsToCsr) EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 3.0); - EXPECT_EQ(v[2], 2.0); - EXPECT_EQ(v[3], 5.0); - ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{3.0}); + EXPECT_EQ(v[2], T{2.0}); + EXPECT_EQ(v[3], T{5.0}); + ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical"); GKO_ASSERT_MTX_NEAR(csr_mtx_c.get(), csr_mtx_m.get(), 0.0); - ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge); + ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path"); } -TEST_F(Dense, MovesToCsr) +TYPED_TEST(Dense, MovesToCsr64) { - auto csr_s_classical = std::make_shared::classical>(); - auto csr_s_merge = std::make_shared::merge_path>(); - auto csr_mtx_c = - gko::matrix::Csr<>::create(mtx4->get_executor(), csr_s_classical); - auto csr_mtx_m = - gko::matrix::Csr<>::create(mtx4->get_executor(), csr_s_merge); - auto mtx_clone = mtx4->clone(); - - mtx4->move_to(csr_mtx_c.get()); + using T = typename TestFixture::value_type; + using Csr = typename gko::matrix::Csr; + auto csr_s_classical = std::make_shared(); + auto csr_s_merge = std::make_shared(); + auto csr_mtx_c = Csr::create(this->mtx4->get_executor(), csr_s_classical); + auto csr_mtx_m = Csr::create(this->mtx4->get_executor(), csr_s_merge); + auto mtx_clone = this->mtx4->clone(); + + this->mtx4->move_to(csr_mtx_c.get()); mtx_clone->move_to(csr_mtx_m.get()); auto v = csr_mtx_c->get_const_values(); @@ -417,22 +591,23 @@ TEST_F(Dense, MovesToCsr) EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 3.0); - EXPECT_EQ(v[2], 2.0); - EXPECT_EQ(v[3], 5.0); - ASSERT_EQ(csr_mtx_c->get_strategy(), 
csr_s_classical); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{3.0}); + EXPECT_EQ(v[2], T{2.0}); + EXPECT_EQ(v[3], T{5.0}); + ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical"); GKO_ASSERT_MTX_NEAR(csr_mtx_c.get(), csr_mtx_m.get(), 0.0); - ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge); + ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path"); } -TEST_F(Dense, ConvertsToSparsityCsr) +TYPED_TEST(Dense, ConvertsToSparsityCsr32) { - auto sparsity_csr_mtx = - gko::matrix::SparsityCsr<>::create(mtx4->get_executor()); + using T = typename TestFixture::value_type; + using SparsityCsr = typename gko::matrix::SparsityCsr; + auto sparsity_csr_mtx = SparsityCsr::create(this->mtx4->get_executor()); - mtx4->convert_to(sparsity_csr_mtx.get()); + this->mtx4->convert_to(sparsity_csr_mtx.get()); auto v = sparsity_csr_mtx->get_const_value(); auto c = sparsity_csr_mtx->get_const_col_idxs(); auto r = sparsity_csr_mtx->get_const_row_ptrs(); @@ -446,16 +621,17 @@ TEST_F(Dense, ConvertsToSparsityCsr) EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], 1.0); + EXPECT_EQ(v[0], T{1.0}); } -TEST_F(Dense, MovesToSparsityCsr) +TYPED_TEST(Dense, MovesToSparsityCsr32) { - auto sparsity_csr_mtx = - gko::matrix::SparsityCsr<>::create(mtx4->get_executor()); + using T = typename TestFixture::value_type; + using SparsityCsr = typename gko::matrix::SparsityCsr; + auto sparsity_csr_mtx = SparsityCsr::create(this->mtx4->get_executor()); - mtx4->move_to(sparsity_csr_mtx.get()); + this->mtx4->move_to(sparsity_csr_mtx.get()); auto v = sparsity_csr_mtx->get_const_value(); auto c = sparsity_csr_mtx->get_const_col_idxs(); auto r = sparsity_csr_mtx->get_const_row_ptrs(); @@ -469,15 +645,115 @@ TEST_F(Dense, MovesToSparsityCsr) EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], 1.0); + EXPECT_EQ(v[0], T{1.0}); } -TEST_F(Dense, ConvertsToEll) +TYPED_TEST(Dense, ConvertsToSparsityCsr64) { - auto ell_mtx = 
gko::matrix::Ell<>::create(mtx7->get_executor()); + using T = typename TestFixture::value_type; + using SparsityCsr = typename gko::matrix::SparsityCsr; + auto sparsity_csr_mtx = SparsityCsr::create(this->mtx4->get_executor()); + + this->mtx4->convert_to(sparsity_csr_mtx.get()); + auto v = sparsity_csr_mtx->get_const_value(); + auto c = sparsity_csr_mtx->get_const_col_idxs(); + auto r = sparsity_csr_mtx->get_const_row_ptrs(); + + ASSERT_EQ(sparsity_csr_mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(sparsity_csr_mtx->get_num_nonzeros(), 4); + EXPECT_EQ(r[0], 0); + EXPECT_EQ(r[1], 3); + EXPECT_EQ(r[2], 4); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[2], 2); + EXPECT_EQ(c[3], 1); + EXPECT_EQ(v[0], T{1.0}); +} - mtx7->convert_to(ell_mtx.get()); + +TYPED_TEST(Dense, MovesToSparsityCsr64) +{ + using T = typename TestFixture::value_type; + using SparsityCsr = typename gko::matrix::SparsityCsr; + auto sparsity_csr_mtx = SparsityCsr::create(this->mtx4->get_executor()); + + this->mtx4->move_to(sparsity_csr_mtx.get()); + auto v = sparsity_csr_mtx->get_const_value(); + auto c = sparsity_csr_mtx->get_const_col_idxs(); + auto r = sparsity_csr_mtx->get_const_row_ptrs(); + + ASSERT_EQ(sparsity_csr_mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(sparsity_csr_mtx->get_num_nonzeros(), 4); + EXPECT_EQ(r[0], 0); + EXPECT_EQ(r[1], 3); + EXPECT_EQ(r[2], 4); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[2], 2); + EXPECT_EQ(c[3], 1); + EXPECT_EQ(v[0], T{1.0}); +} + + +TYPED_TEST(Dense, ConvertsToEll32) +{ + using T = typename TestFixture::value_type; + using Ell = typename gko::matrix::Ell; + auto ell_mtx = Ell::create(this->mtx6->get_executor()); + + this->mtx6->convert_to(ell_mtx.get()); + auto v = ell_mtx->get_const_values(); + auto c = ell_mtx->get_const_col_idxs(); + + ASSERT_EQ(ell_mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(ell_mtx->get_num_stored_elements_per_row(), 2); + ASSERT_EQ(ell_mtx->get_num_stored_elements(), 4); + 
ASSERT_EQ(ell_mtx->get_stride(), 2); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[2], 1); + EXPECT_EQ(c[3], 0); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{1.5}); + EXPECT_EQ(v[2], T{2.0}); + EXPECT_EQ(v[3], T{0.0}); +} + + +TYPED_TEST(Dense, MovesToEll32) +{ + using T = typename TestFixture::value_type; + using Ell = typename gko::matrix::Ell; + auto ell_mtx = Ell::create(this->mtx6->get_executor()); + + this->mtx6->move_to(ell_mtx.get()); + auto v = ell_mtx->get_const_values(); + auto c = ell_mtx->get_const_col_idxs(); + + ASSERT_EQ(ell_mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(ell_mtx->get_num_stored_elements_per_row(), 2); + ASSERT_EQ(ell_mtx->get_num_stored_elements(), 4); + ASSERT_EQ(ell_mtx->get_stride(), 2); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[2], 1); + EXPECT_EQ(c[3], 0); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{1.5}); + EXPECT_EQ(v[2], T{2.0}); + EXPECT_EQ(v[3], T{0.0}); +} + + +TYPED_TEST(Dense, ConvertsToEll64) +{ + using T = typename TestFixture::value_type; + using Ell = typename gko::matrix::Ell; + auto ell_mtx = Ell::create(this->mtx6->get_executor()); + + this->mtx6->convert_to(ell_mtx.get()); auto v = ell_mtx->get_const_values(); auto c = ell_mtx->get_const_col_idxs(); @@ -489,18 +765,20 @@ TEST_F(Dense, ConvertsToEll) EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 1); EXPECT_EQ(c[3], 0); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 1.5); - EXPECT_EQ(v[2], 2.0); - EXPECT_EQ(v[3], 0.0); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{1.5}); + EXPECT_EQ(v[2], T{2.0}); + EXPECT_EQ(v[3], T{0.0}); } -TEST_F(Dense, MovesToEll) +TYPED_TEST(Dense, MovesToEll64) { - auto ell_mtx = gko::matrix::Ell<>::create(mtx7->get_executor()); + using T = typename TestFixture::value_type; + using Ell = typename gko::matrix::Ell; + auto ell_mtx = Ell::create(this->mtx6->get_executor()); - mtx7->move_to(ell_mtx.get()); + this->mtx6->move_to(ell_mtx.get()); auto v = ell_mtx->get_const_values(); auto c = ell_mtx->get_const_col_idxs(); @@ 
-512,19 +790,20 @@ TEST_F(Dense, MovesToEll) EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 1); EXPECT_EQ(c[3], 0); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 1.5); - EXPECT_EQ(v[2], 2.0); - EXPECT_EQ(v[3], 0.0); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{1.5}); + EXPECT_EQ(v[2], T{2.0}); + EXPECT_EQ(v[3], T{0.0}); } -TEST_F(Dense, ConvertsToEllWithStride) +TYPED_TEST(Dense, ConvertsToEllWithStride) { - auto ell_mtx = - gko::matrix::Ell<>::create(mtx7->get_executor(), gko::dim<2>{}, 0, 3); + using T = typename TestFixture::value_type; + using Ell = typename gko::matrix::Ell; + auto ell_mtx = Ell::create(this->mtx6->get_executor(), gko::dim<2>{}, 0, 3); - mtx7->convert_to(ell_mtx.get()); + this->mtx6->convert_to(ell_mtx.get()); auto v = ell_mtx->get_const_values(); auto c = ell_mtx->get_const_col_idxs(); @@ -538,21 +817,22 @@ TEST_F(Dense, ConvertsToEllWithStride) EXPECT_EQ(c[3], 1); EXPECT_EQ(c[4], 0); EXPECT_EQ(c[5], 0); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 1.5); - EXPECT_EQ(v[2], 0.0); - EXPECT_EQ(v[3], 2.0); - EXPECT_EQ(v[4], 0.0); - EXPECT_EQ(v[5], 0.0); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{1.5}); + EXPECT_EQ(v[2], T{0.0}); + EXPECT_EQ(v[3], T{2.0}); + EXPECT_EQ(v[4], T{0.0}); + EXPECT_EQ(v[5], T{0.0}); } -TEST_F(Dense, MovesToEllWithStride) +TYPED_TEST(Dense, MovesToEllWithStride) { - auto ell_mtx = - gko::matrix::Ell<>::create(mtx7->get_executor(), gko::dim<2>{}, 0, 3); + using T = typename TestFixture::value_type; + using Ell = typename gko::matrix::Ell; + auto ell_mtx = Ell::create(this->mtx6->get_executor(), gko::dim<2>{}, 0, 3); - mtx7->move_to(ell_mtx.get()); + this->mtx6->move_to(ell_mtx.get()); auto v = ell_mtx->get_const_values(); auto c = ell_mtx->get_const_col_idxs(); @@ -566,20 +846,88 @@ TEST_F(Dense, MovesToEllWithStride) EXPECT_EQ(c[3], 1); EXPECT_EQ(c[4], 0); EXPECT_EQ(c[5], 0); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 1.5); - EXPECT_EQ(v[2], 0.0); - EXPECT_EQ(v[3], 2.0); - EXPECT_EQ(v[4], 0.0); - EXPECT_EQ(v[5], 0.0); + EXPECT_EQ(v[0], 
T{1.0}); + EXPECT_EQ(v[1], T{1.5}); + EXPECT_EQ(v[2], T{0.0}); + EXPECT_EQ(v[3], T{2.0}); + EXPECT_EQ(v[4], T{0.0}); + EXPECT_EQ(v[5], T{0.0}); +} + + +TYPED_TEST(Dense, MovesToHybridAutomatically32) +{ + using T = typename TestFixture::value_type; + using Hybrid = typename gko::matrix::Hybrid; + auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor()); + + this->mtx4->move_to(hybrid_mtx.get()); + auto v = hybrid_mtx->get_const_coo_values(); + auto c = hybrid_mtx->get_const_coo_col_idxs(); + auto r = hybrid_mtx->get_const_coo_row_idxs(); + auto n = hybrid_mtx->get_ell_num_stored_elements_per_row(); + auto p = hybrid_mtx->get_ell_stride(); + + ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 0); + ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 4); + EXPECT_EQ(n, 0); + EXPECT_EQ(p, 2); + EXPECT_EQ(r[0], 0); + EXPECT_EQ(r[1], 0); + EXPECT_EQ(r[2], 0); + EXPECT_EQ(r[3], 1); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[2], 2); + EXPECT_EQ(c[3], 1); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{3.0}); + EXPECT_EQ(v[2], T{2.0}); + EXPECT_EQ(v[3], T{5.0}); +} + + +TYPED_TEST(Dense, ConvertsToHybridAutomatically32) +{ + using T = typename TestFixture::value_type; + using Hybrid = typename gko::matrix::Hybrid; + auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor()); + + this->mtx4->convert_to(hybrid_mtx.get()); + auto v = hybrid_mtx->get_const_coo_values(); + auto c = hybrid_mtx->get_const_coo_col_idxs(); + auto r = hybrid_mtx->get_const_coo_row_idxs(); + auto n = hybrid_mtx->get_ell_num_stored_elements_per_row(); + auto p = hybrid_mtx->get_ell_stride(); + + ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 0); + ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 4); + EXPECT_EQ(n, 0); + EXPECT_EQ(p, 2); + EXPECT_EQ(r[0], 0); + EXPECT_EQ(r[1], 0); + EXPECT_EQ(r[2], 0); + EXPECT_EQ(r[3], 1); + EXPECT_EQ(c[0], 0); + 
EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[2], 2); + EXPECT_EQ(c[3], 1); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{3.0}); + EXPECT_EQ(v[2], T{2.0}); + EXPECT_EQ(v[3], T{5.0}); } -TEST_F(Dense, MovesToHybridAutomatically) +TYPED_TEST(Dense, MovesToHybridAutomatically64) { - auto hybrid_mtx = gko::matrix::Hybrid<>::create(mtx4->get_executor()); + using T = typename TestFixture::value_type; + using Hybrid = typename gko::matrix::Hybrid; + auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor()); - mtx4->move_to(hybrid_mtx.get()); + this->mtx4->move_to(hybrid_mtx.get()); auto v = hybrid_mtx->get_const_coo_values(); auto c = hybrid_mtx->get_const_coo_col_idxs(); auto r = hybrid_mtx->get_const_coo_row_idxs(); @@ -599,18 +947,20 @@ TEST_F(Dense, MovesToHybridAutomatically) EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 3.0); - EXPECT_EQ(v[2], 2.0); - EXPECT_EQ(v[3], 5.0); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{3.0}); + EXPECT_EQ(v[2], T{2.0}); + EXPECT_EQ(v[3], T{5.0}); } -TEST_F(Dense, ConvertsToHybridAutomatically) +TYPED_TEST(Dense, ConvertsToHybridAutomatically64) { - auto hybrid_mtx = gko::matrix::Hybrid<>::create(mtx4->get_executor()); + using T = typename TestFixture::value_type; + using Hybrid = typename gko::matrix::Hybrid; + auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor()); - mtx4->convert_to(hybrid_mtx.get()); + this->mtx4->convert_to(hybrid_mtx.get()); auto v = hybrid_mtx->get_const_coo_values(); auto c = hybrid_mtx->get_const_coo_col_idxs(); auto r = hybrid_mtx->get_const_coo_row_idxs(); @@ -630,19 +980,21 @@ TEST_F(Dense, ConvertsToHybridAutomatically) EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 3.0); - EXPECT_EQ(v[2], 2.0); - EXPECT_EQ(v[3], 5.0); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{3.0}); + EXPECT_EQ(v[2], T{2.0}); + EXPECT_EQ(v[3], T{5.0}); } -TEST_F(Dense, MovesToHybridWithStrideAutomatically) +TYPED_TEST(Dense, 
MovesToHybridWithStrideAutomatically) { - auto hybrid_mtx = gko::matrix::Hybrid<>::create(mtx4->get_executor(), - gko::dim<2>{}, 0, 3); + using T = typename TestFixture::value_type; + using Hybrid = typename gko::matrix::Hybrid; + auto hybrid_mtx = + Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{}, 0, 3); - mtx4->move_to(hybrid_mtx.get()); + this->mtx4->move_to(hybrid_mtx.get()); auto v = hybrid_mtx->get_const_coo_values(); auto c = hybrid_mtx->get_const_coo_col_idxs(); auto r = hybrid_mtx->get_const_coo_row_idxs(); @@ -662,19 +1014,21 @@ TEST_F(Dense, MovesToHybridWithStrideAutomatically) EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 3.0); - EXPECT_EQ(v[2], 2.0); - EXPECT_EQ(v[3], 5.0); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{3.0}); + EXPECT_EQ(v[2], T{2.0}); + EXPECT_EQ(v[3], T{5.0}); } -TEST_F(Dense, ConvertsToHybridWithStrideAutomatically) +TYPED_TEST(Dense, ConvertsToHybridWithStrideAutomatically) { - auto hybrid_mtx = gko::matrix::Hybrid<>::create(mtx4->get_executor(), - gko::dim<2>{}, 0, 3); + using T = typename TestFixture::value_type; + using Hybrid = typename gko::matrix::Hybrid; + auto hybrid_mtx = + Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{}, 0, 3); - mtx4->convert_to(hybrid_mtx.get()); + this->mtx4->convert_to(hybrid_mtx.get()); auto v = hybrid_mtx->get_const_coo_values(); auto c = hybrid_mtx->get_const_coo_col_idxs(); auto r = hybrid_mtx->get_const_coo_row_idxs(); @@ -694,20 +1048,22 @@ TEST_F(Dense, ConvertsToHybridWithStrideAutomatically) EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 3.0); - EXPECT_EQ(v[2], 2.0); - EXPECT_EQ(v[3], 5.0); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{3.0}); + EXPECT_EQ(v[2], T{2.0}); + EXPECT_EQ(v[3], T{5.0}); } -TEST_F(Dense, MovesToHybridWithStrideAndCooLengthByColumns2) +TYPED_TEST(Dense, MovesToHybridWithStrideAndCooLengthByColumns2) { - auto hybrid_mtx = 
gko::matrix::Hybrid<>::create( - mtx4->get_executor(), gko::dim<2>{}, 0, 3, 3, - std::make_shared::column_limit>(2)); + using T = typename TestFixture::value_type; + using Hybrid = typename gko::matrix::Hybrid; + auto hybrid_mtx = + Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{}, 0, 3, 3, + std::make_shared(2)); - mtx4->move_to(hybrid_mtx.get()); + this->mtx4->move_to(hybrid_mtx.get()); auto v = hybrid_mtx->get_const_ell_values(); auto c = hybrid_mtx->get_const_ell_col_idxs(); auto n = hybrid_mtx->get_ell_num_stored_elements_per_row(); @@ -724,15 +1080,15 @@ TEST_F(Dense, MovesToHybridWithStrideAndCooLengthByColumns2) EXPECT_EQ(c[3], 1); EXPECT_EQ(c[4], 0); EXPECT_EQ(c[5], 0); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 5.0); - EXPECT_EQ(v[2], 0.0); - EXPECT_EQ(v[3], 3.0); - EXPECT_EQ(v[4], 0.0); - EXPECT_EQ(v[5], 0.0); - EXPECT_EQ(hybrid_mtx->get_const_coo_values()[0], 2.0); - EXPECT_EQ(hybrid_mtx->get_const_coo_values()[1], 0.0); - EXPECT_EQ(hybrid_mtx->get_const_coo_values()[2], 0.0); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{5.0}); + EXPECT_EQ(v[2], T{0.0}); + EXPECT_EQ(v[3], T{3.0}); + EXPECT_EQ(v[4], T{0.0}); + EXPECT_EQ(v[5], T{0.0}); + EXPECT_EQ(hybrid_mtx->get_const_coo_values()[0], T{2.0}); + EXPECT_EQ(hybrid_mtx->get_const_coo_values()[1], T{0.0}); + EXPECT_EQ(hybrid_mtx->get_const_coo_values()[2], T{0.0}); EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[0], 2); EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[1], 0); EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[2], 0); @@ -742,13 +1098,15 @@ TEST_F(Dense, MovesToHybridWithStrideAndCooLengthByColumns2) } -TEST_F(Dense, ConvertsToHybridWithStrideAndCooLengthByColumns2) +TYPED_TEST(Dense, ConvertsToHybridWithStrideAndCooLengthByColumns2) { - auto hybrid_mtx = gko::matrix::Hybrid<>::create( - mtx4->get_executor(), gko::dim<2>{}, 0, 3, 3, - std::make_shared::column_limit>(2)); + using T = typename TestFixture::value_type; + using Hybrid = typename gko::matrix::Hybrid; + auto hybrid_mtx = + 
Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{}, 0, 3, 3, + std::make_shared(2)); - mtx4->convert_to(hybrid_mtx.get()); + this->mtx4->convert_to(hybrid_mtx.get()); auto v = hybrid_mtx->get_const_ell_values(); auto c = hybrid_mtx->get_const_ell_col_idxs(); auto n = hybrid_mtx->get_ell_num_stored_elements_per_row(); @@ -765,15 +1123,15 @@ TEST_F(Dense, ConvertsToHybridWithStrideAndCooLengthByColumns2) EXPECT_EQ(c[3], 1); EXPECT_EQ(c[4], 0); EXPECT_EQ(c[5], 0); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 5.0); - EXPECT_EQ(v[2], 0.0); - EXPECT_EQ(v[3], 3.0); - EXPECT_EQ(v[4], 0.0); - EXPECT_EQ(v[5], 0.0); - EXPECT_EQ(hybrid_mtx->get_const_coo_values()[0], 2.0); - EXPECT_EQ(hybrid_mtx->get_const_coo_values()[1], 0.0); - EXPECT_EQ(hybrid_mtx->get_const_coo_values()[2], 0.0); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{5.0}); + EXPECT_EQ(v[2], T{0.0}); + EXPECT_EQ(v[3], T{3.0}); + EXPECT_EQ(v[4], T{0.0}); + EXPECT_EQ(v[5], T{0.0}); + EXPECT_EQ(hybrid_mtx->get_const_coo_values()[0], T{2.0}); + EXPECT_EQ(hybrid_mtx->get_const_coo_values()[1], T{0.0}); + EXPECT_EQ(hybrid_mtx->get_const_coo_values()[2], T{0.0}); EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[0], 2); EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[1], 0); EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[2], 0); @@ -783,13 +1141,15 @@ TEST_F(Dense, ConvertsToHybridWithStrideAndCooLengthByColumns2) } -TEST_F(Dense, MovesToHybridWithStrideByPercent40) +TYPED_TEST(Dense, MovesToHybridWithStrideByPercent40) { - auto hybrid_mtx = gko::matrix::Hybrid<>::create( - mtx4->get_executor(), gko::dim<2>{}, 0, 3, - std::make_shared::imbalance_limit>(0.4)); + using T = typename TestFixture::value_type; + using Hybrid = typename gko::matrix::Hybrid; + auto hybrid_mtx = + Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{}, 0, 3, + std::make_shared(0.4)); - mtx4->move_to(hybrid_mtx.get()); + this->mtx4->move_to(hybrid_mtx.get()); auto v = hybrid_mtx->get_const_ell_values(); auto c = 
hybrid_mtx->get_const_ell_col_idxs(); auto n = hybrid_mtx->get_ell_num_stored_elements_per_row(); @@ -805,12 +1165,12 @@ TEST_F(Dense, MovesToHybridWithStrideByPercent40) EXPECT_EQ(c[0], 0); EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 0); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 5.0); - EXPECT_EQ(v[2], 0.0); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{5.0}); + EXPECT_EQ(v[2], T{0.0}); ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 2); - EXPECT_EQ(coo_v[0], 3.0); - EXPECT_EQ(coo_v[1], 2.0); + EXPECT_EQ(coo_v[0], T{3.0}); + EXPECT_EQ(coo_v[1], T{2.0}); EXPECT_EQ(coo_c[0], 1); EXPECT_EQ(coo_c[1], 2); EXPECT_EQ(coo_r[0], 0); @@ -818,13 +1178,15 @@ TEST_F(Dense, MovesToHybridWithStrideByPercent40) } -TEST_F(Dense, ConvertsToHybridWithStrideByPercent40) +TYPED_TEST(Dense, ConvertsToHybridWithStrideByPercent40) { - auto hybrid_mtx = gko::matrix::Hybrid<>::create( - mtx4->get_executor(), gko::dim<2>{}, 0, 3, - std::make_shared::imbalance_limit>(0.4)); + using T = typename TestFixture::value_type; + using Hybrid = typename gko::matrix::Hybrid; + auto hybrid_mtx = + Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{}, 0, 3, + std::make_shared(0.4)); - mtx4->convert_to(hybrid_mtx.get()); + this->mtx4->convert_to(hybrid_mtx.get()); auto v = hybrid_mtx->get_const_ell_values(); auto c = hybrid_mtx->get_const_ell_col_idxs(); auto n = hybrid_mtx->get_ell_num_stored_elements_per_row(); @@ -840,12 +1202,12 @@ TEST_F(Dense, ConvertsToHybridWithStrideByPercent40) EXPECT_EQ(c[0], 0); EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 0); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 5.0); - EXPECT_EQ(v[2], 0.0); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{5.0}); + EXPECT_EQ(v[2], T{0.0}); ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 2); - EXPECT_EQ(coo_v[0], 3.0); - EXPECT_EQ(coo_v[1], 2.0); + EXPECT_EQ(coo_v[0], T{3.0}); + EXPECT_EQ(coo_v[1], T{2.0}); EXPECT_EQ(coo_c[0], 1); EXPECT_EQ(coo_c[1], 2); EXPECT_EQ(coo_r[0], 0); @@ -853,11 +1215,13 @@ TEST_F(Dense, 
ConvertsToHybridWithStrideByPercent40) } -TEST_F(Dense, ConvertsToSellp) +TYPED_TEST(Dense, ConvertsToSellp32) { - auto sellp_mtx = gko::matrix::Sellp<>::create(mtx8->get_executor()); + using T = typename TestFixture::value_type; + using Sellp = typename gko::matrix::Sellp; + auto sellp_mtx = Sellp::create(this->mtx7->get_executor()); - mtx8->convert_to(sellp_mtx.get()); + this->mtx7->convert_to(sellp_mtx.get()); auto v = sellp_mtx->get_const_values(); auto c = sellp_mtx->get_const_col_idxs(); auto s = sellp_mtx->get_const_slice_sets(); @@ -876,23 +1240,25 @@ TEST_F(Dense, ConvertsToSellp) EXPECT_EQ(c[gko::matrix::default_slice_size + 1], 0); EXPECT_EQ(c[2 * gko::matrix::default_slice_size], 2); EXPECT_EQ(c[2 * gko::matrix::default_slice_size + 1], 0); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 1.5); - EXPECT_EQ(v[gko::matrix::default_slice_size], 2.0); - EXPECT_EQ(v[gko::matrix::default_slice_size + 1], 0.0); - EXPECT_EQ(v[2 * gko::matrix::default_slice_size], 3.0); - EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], 0.0); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{1.5}); + EXPECT_EQ(v[gko::matrix::default_slice_size], T{2.0}); + EXPECT_EQ(v[gko::matrix::default_slice_size + 1], T{0.0}); + EXPECT_EQ(v[2 * gko::matrix::default_slice_size], T{3.0}); + EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], T{0.0}); EXPECT_EQ(s[0], 0); EXPECT_EQ(s[1], 3); EXPECT_EQ(l[0], 3); } -TEST_F(Dense, MovesToSellp) +TYPED_TEST(Dense, MovesToSellp32) { - auto sellp_mtx = gko::matrix::Sellp<>::create(mtx8->get_executor()); + using T = typename TestFixture::value_type; + using Sellp = typename gko::matrix::Sellp; + auto sellp_mtx = Sellp::create(this->mtx7->get_executor()); - mtx8->move_to(sellp_mtx.get()); + this->mtx7->move_to(sellp_mtx.get()); auto v = sellp_mtx->get_const_values(); auto c = sellp_mtx->get_const_col_idxs(); auto s = sellp_mtx->get_const_slice_sets(); @@ -911,24 +1277,100 @@ TEST_F(Dense, MovesToSellp) EXPECT_EQ(c[gko::matrix::default_slice_size + 1], 0); 
EXPECT_EQ(c[2 * gko::matrix::default_slice_size], 2); EXPECT_EQ(c[2 * gko::matrix::default_slice_size + 1], 0); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 1.5); - EXPECT_EQ(v[gko::matrix::default_slice_size], 2.0); - EXPECT_EQ(v[gko::matrix::default_slice_size + 1], 0.0); - EXPECT_EQ(v[2 * gko::matrix::default_slice_size], 3.0); - EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], 0.0); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{1.5}); + EXPECT_EQ(v[gko::matrix::default_slice_size], T{2.0}); + EXPECT_EQ(v[gko::matrix::default_slice_size + 1], T{0.0}); + EXPECT_EQ(v[2 * gko::matrix::default_slice_size], T{3.0}); + EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], T{0.0}); EXPECT_EQ(s[0], 0); EXPECT_EQ(s[1], 3); EXPECT_EQ(l[0], 3); } -TEST_F(Dense, ConvertsToSellpWithSliceSizeAndStrideFactor) +TYPED_TEST(Dense, ConvertsToSellp64) { - auto sellp_mtx = gko::matrix::Sellp<>::create(mtx8->get_executor(), - gko::dim<2>{}, 2, 2, 0); + using T = typename TestFixture::value_type; + using Sellp = typename gko::matrix::Sellp; + auto sellp_mtx = Sellp::create(this->mtx7->get_executor()); - mtx8->convert_to(sellp_mtx.get()); + this->mtx7->convert_to(sellp_mtx.get()); + auto v = sellp_mtx->get_const_values(); + auto c = sellp_mtx->get_const_col_idxs(); + auto s = sellp_mtx->get_const_slice_sets(); + auto l = sellp_mtx->get_const_slice_lengths(); + + ASSERT_EQ(sellp_mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(sellp_mtx->get_total_cols(), 3); + ASSERT_EQ(sellp_mtx->get_num_stored_elements(), + 3 * gko::matrix::default_slice_size); + ASSERT_EQ(sellp_mtx->get_slice_size(), gko::matrix::default_slice_size); + ASSERT_EQ(sellp_mtx->get_stride_factor(), + gko::matrix::default_stride_factor); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[gko::matrix::default_slice_size], 1); + EXPECT_EQ(c[gko::matrix::default_slice_size + 1], 0); + EXPECT_EQ(c[2 * gko::matrix::default_slice_size], 2); + EXPECT_EQ(c[2 * gko::matrix::default_slice_size + 1], 0); + EXPECT_EQ(v[0], 
T{1.0}); + EXPECT_EQ(v[1], T{1.5}); + EXPECT_EQ(v[gko::matrix::default_slice_size], T{2.0}); + EXPECT_EQ(v[gko::matrix::default_slice_size + 1], T{0.0}); + EXPECT_EQ(v[2 * gko::matrix::default_slice_size], T{3.0}); + EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], T{0.0}); + EXPECT_EQ(s[0], 0); + EXPECT_EQ(s[1], 3); + EXPECT_EQ(l[0], 3); +} + + +TYPED_TEST(Dense, MovesToSellp64) +{ + using T = typename TestFixture::value_type; + using Sellp = typename gko::matrix::Sellp; + auto sellp_mtx = Sellp::create(this->mtx7->get_executor()); + + this->mtx7->move_to(sellp_mtx.get()); + auto v = sellp_mtx->get_const_values(); + auto c = sellp_mtx->get_const_col_idxs(); + auto s = sellp_mtx->get_const_slice_sets(); + auto l = sellp_mtx->get_const_slice_lengths(); + + ASSERT_EQ(sellp_mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(sellp_mtx->get_total_cols(), 3); + ASSERT_EQ(sellp_mtx->get_num_stored_elements(), + 3 * gko::matrix::default_slice_size); + ASSERT_EQ(sellp_mtx->get_slice_size(), gko::matrix::default_slice_size); + ASSERT_EQ(sellp_mtx->get_stride_factor(), + gko::matrix::default_stride_factor); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[gko::matrix::default_slice_size], 1); + EXPECT_EQ(c[gko::matrix::default_slice_size + 1], 0); + EXPECT_EQ(c[2 * gko::matrix::default_slice_size], 2); + EXPECT_EQ(c[2 * gko::matrix::default_slice_size + 1], 0); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{1.5}); + EXPECT_EQ(v[gko::matrix::default_slice_size], T{2.0}); + EXPECT_EQ(v[gko::matrix::default_slice_size + 1], T{0.0}); + EXPECT_EQ(v[2 * gko::matrix::default_slice_size], T{3.0}); + EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], T{0.0}); + EXPECT_EQ(s[0], 0); + EXPECT_EQ(s[1], 3); + EXPECT_EQ(l[0], 3); +} + + +TYPED_TEST(Dense, ConvertsToSellpWithSliceSizeAndStrideFactor) +{ + using T = typename TestFixture::value_type; + using Sellp = typename gko::matrix::Sellp; + auto sellp_mtx = + Sellp::create(this->mtx7->get_executor(), gko::dim<2>{}, 2, 2, 
0); + + this->mtx7->convert_to(sellp_mtx.get()); auto v = sellp_mtx->get_const_values(); auto c = sellp_mtx->get_const_col_idxs(); auto s = sellp_mtx->get_const_slice_sets(); @@ -947,26 +1389,28 @@ TEST_F(Dense, ConvertsToSellpWithSliceSizeAndStrideFactor) EXPECT_EQ(c[5], 0); EXPECT_EQ(c[6], 0); EXPECT_EQ(c[7], 0); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 1.5); - EXPECT_EQ(v[2], 2.0); - EXPECT_EQ(v[3], 0.0); - EXPECT_EQ(v[4], 3.0); - EXPECT_EQ(v[5], 0.0); - EXPECT_EQ(v[6], 0.0); - EXPECT_EQ(v[7], 0.0); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{1.5}); + EXPECT_EQ(v[2], T{2.0}); + EXPECT_EQ(v[3], T{0.0}); + EXPECT_EQ(v[4], T{3.0}); + EXPECT_EQ(v[5], T{0.0}); + EXPECT_EQ(v[6], T{0.0}); + EXPECT_EQ(v[7], T{0.0}); EXPECT_EQ(s[0], 0); EXPECT_EQ(s[1], 4); EXPECT_EQ(l[0], 4); } -TEST_F(Dense, MovesToSellpWithSliceSizeAndStrideFactor) +TYPED_TEST(Dense, MovesToSellpWithSliceSizeAndStrideFactor) { - auto sellp_mtx = gko::matrix::Sellp<>::create(mtx8->get_executor(), - gko::dim<2>{}, 2, 2, 0); + using T = typename TestFixture::value_type; + using Sellp = typename gko::matrix::Sellp; + auto sellp_mtx = + Sellp::create(this->mtx7->get_executor(), gko::dim<2>{}, 2, 2, 0); - mtx8->move_to(sellp_mtx.get()); + this->mtx7->move_to(sellp_mtx.get()); auto v = sellp_mtx->get_const_values(); auto c = sellp_mtx->get_const_col_idxs(); auto s = sellp_mtx->get_const_slice_sets(); @@ -985,63 +1429,562 @@ TEST_F(Dense, MovesToSellpWithSliceSizeAndStrideFactor) EXPECT_EQ(c[5], 0); EXPECT_EQ(c[6], 0); EXPECT_EQ(c[7], 0); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 1.5); - EXPECT_EQ(v[2], 2.0); - EXPECT_EQ(v[3], 0.0); - EXPECT_EQ(v[4], 3.0); - EXPECT_EQ(v[5], 0.0); - EXPECT_EQ(v[6], 0.0); - EXPECT_EQ(v[7], 0.0); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{1.5}); + EXPECT_EQ(v[2], T{2.0}); + EXPECT_EQ(v[3], T{0.0}); + EXPECT_EQ(v[4], T{3.0}); + EXPECT_EQ(v[5], T{0.0}); + EXPECT_EQ(v[6], T{0.0}); + EXPECT_EQ(v[7], T{0.0}); EXPECT_EQ(s[0], 0); EXPECT_EQ(s[1], 4); EXPECT_EQ(l[0], 4); } 
-TEST_F(Dense, SquareMatrixIsTransposable) +TYPED_TEST(Dense, ConvertsToAndFromSellpWithMoreThanOneSlice) +{ + using T = typename TestFixture::value_type; + using Mtx = typename TestFixture::Mtx; + using Sellp = typename gko::matrix::Sellp; + auto x = this->template gen_mtx(65, 25); + + auto sellp_mtx = Sellp::create(this->exec); + auto dense_mtx = Mtx::create(this->exec); + x->convert_to(sellp_mtx.get()); + sellp_mtx->convert_to(dense_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dense_mtx.get(), x.get(), r::value); +} + + +TYPED_TEST(Dense, ConvertsEmptyToPrecision) +{ + using Dense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using OtherT = typename gko::next_precision; + using OtherDense = typename gko::matrix::Dense; + auto empty = OtherDense::create(this->exec); + auto res = Dense::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Dense, MovesEmptyToPrecision) +{ + using Dense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using OtherT = typename gko::next_precision; + using OtherDense = typename gko::matrix::Dense; + auto empty = OtherDense::create(this->exec); + auto res = Dense::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Dense, ConvertsEmptyToCoo) +{ + using Dense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using Coo = typename gko::matrix::Coo; + auto empty = Dense::create(this->exec); + auto res = Coo::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Dense, MovesEmptyToCoo) { - auto trans = mtx5->transpose(); - auto trans_as_dense = static_cast *>(trans.get()); + using Dense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using Coo = typename gko::matrix::Coo; + auto empty = Dense::create(this->exec); + auto res = 
Coo::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Dense, ConvertsEmptyMatrixToCsr) +{ + using Dense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using Csr = typename gko::matrix::Csr; + auto empty = Dense::create(this->exec); + auto res = Csr::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_row_ptrs(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Dense, MovesEmptyMatrixToCsr) +{ + using Dense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using Csr = typename gko::matrix::Csr; + auto empty = Dense::create(this->exec); + auto res = Csr::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_row_ptrs(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Dense, ConvertsEmptyToSparsityCsr) +{ + using Dense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using SparsityCsr = typename gko::matrix::SparsityCsr; + auto empty = Dense::create(this->exec); + auto res = SparsityCsr::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_EQ(res->get_num_nonzeros(), 0); + ASSERT_EQ(*res->get_const_row_ptrs(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Dense, MovesEmptyToSparsityCsr) +{ + using Dense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using SparsityCsr = typename gko::matrix::SparsityCsr; + auto empty = Dense::create(this->exec); + auto res = SparsityCsr::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_EQ(res->get_num_nonzeros(), 0); + ASSERT_EQ(*res->get_const_row_ptrs(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Dense, ConvertsEmptyToEll) +{ + using Dense = typename TestFixture::Mtx; + using T = typename 
TestFixture::value_type; + using Ell = typename gko::matrix::Ell; + auto empty = Dense::create(this->exec); + auto res = Ell::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Dense, MovesEmptyToEll) +{ + using Dense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using Ell = typename gko::matrix::Ell; + auto empty = Dense::create(this->exec); + auto res = Ell::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Dense, ConvertsEmptyToHybrid) +{ + using Dense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using Hybrid = typename gko::matrix::Hybrid; + auto empty = Dense::create(this->exec); + auto res = Hybrid::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Dense, MovesEmptyToHybrid) +{ + using Dense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using Hybrid = typename gko::matrix::Hybrid; + auto empty = Dense::create(this->exec); + auto res = Hybrid::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Dense, ConvertsEmptyToSellp) +{ + using Dense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using Sellp = typename gko::matrix::Sellp; + auto empty = Dense::create(this->exec); + auto res = Sellp::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_slice_sets(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Dense, MovesEmptyToSellp) +{ + using Dense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using Sellp = typename 
gko::matrix::Sellp; + auto empty = Dense::create(this->exec); + auto res = Sellp::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_slice_sets(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Dense, SquareMatrixIsTransposable) +{ + using Mtx = typename TestFixture::Mtx; + auto trans = this->mtx5->transpose(); + auto trans_as_dense = static_cast(trans.get()); GKO_ASSERT_MTX_NEAR( trans_as_dense, - l({{1.0, -2.0, 2.1}, {-1.0, 2.0, 3.4}, {-0.5, 4.5, 1.2}}), 0.0); + l({{1.0, -2.0, 2.1}, {-1.0, 2.0, 3.4}, {-0.5, 4.5, 1.2}}), + r::value); } -TEST_F(Dense, NonSquareMatrixIsTransposable) +TYPED_TEST(Dense, NonSquareMatrixIsTransposable) { - auto trans = mtx4->transpose(); - auto trans_as_dense = static_cast *>(trans.get()); + using Mtx = typename TestFixture::Mtx; + auto trans = this->mtx4->transpose(); + auto trans_as_dense = static_cast(trans.get()); GKO_ASSERT_MTX_NEAR(trans_as_dense, l({{1.0, 0.0}, {3.0, 5.0}, {2.0, 0.0}}), - 0.0); + r::value); } -TEST_F(Dense, NonSquareMatrixIsConjugateTransposable) +TYPED_TEST(Dense, SquareMatrixIsRowPermutable) { - auto trans = mtx6->conj_transpose(); - auto trans_as_dense = - static_cast> *>(trans.get()); + // clang-format off + // {1.0, -1.0, -0.5}, + // {-2.0, 2.0, 4.5}, + // {2.1, 3.4, 1.2} + // clang-format on + using Mtx = typename TestFixture::Mtx; + auto exec = this->mtx5->get_executor(); + gko::Array permute_idxs{exec, {1, 2, 0}}; + auto row_permute = this->mtx5->row_permute(&permute_idxs); + + auto row_permute_dense = static_cast(row_permute.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(row_permute_dense, + l({{-2.0, 2.0, 4.5}, + {2.1, 3.4, 1.2}, + {1.0, -1.0, -0.5}}), r::value); + // clang-format on +} - GKO_ASSERT_MTX_NEAR(trans_as_dense, - l({{1.0 - 2.0 * i, -2.0 - 1.5 * i, 1.0 + 0.0 * i}, - {-1.0 - 2.1 * i, 4.5 + 0.0 * i, -i}}), - 0.0); + +TYPED_TEST(Dense, NonSquareMatrixIsRowPermutable) +{ + // clang-format off + // {1.0, 
3.0, 2.0}, + // {0.0, 5.0, 0.0} + // clang-format on + using Mtx = typename TestFixture::Mtx; + auto exec = this->mtx4->get_executor(); + gko::Array permute_idxs{exec, {1, 0}}; + auto row_permute = this->mtx4->row_permute(&permute_idxs); + + auto row_permute_dense = static_cast(row_permute.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(row_permute_dense, + l({{0.0, 5.0, 0.0}, + {1.0, 3.0, 2.0}}), r::value); + // clang-format on } -TEST_F(Dense, ConvertsToAndFromSellpWithMoreThanOneSlice) + +TYPED_TEST(Dense, SquareMatrixIsColPermutable) { - auto x = gen_mtx(65, 25); + // clang-format off + // {1.0, -1.0, -0.5}, + // {-2.0, 2.0, 4.5}, + // {2.1, 3.4, 1.2} + // clang-format on + using Mtx = typename TestFixture::Mtx; + auto exec = this->mtx5->get_executor(); + gko::Array permute_idxs{exec, {1, 2, 0}}; + auto c_permute = this->mtx5->column_permute(&permute_idxs); + + auto c_permute_dense = static_cast(c_permute.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(c_permute_dense, + l({{-1.0, -0.5, 1.0}, {2.0, 4.5, -2.0}, {3.4, 1.2, 2.1}}), r::value); + // clang-format on +} + + +TYPED_TEST(Dense, NonSquareMatrixIsColPermutable) +{ + // clang-format off + // {1.0, 3.0, 2.0}, + // {0.0, 5.0, 0.0} + // clang-format on + using Mtx = typename TestFixture::Mtx; + auto exec = this->mtx4->get_executor(); + gko::Array permute_idxs{exec, {1, 2, 0}}; + auto c_permute = this->mtx4->column_permute(&permute_idxs); + + auto c_permute_dense = static_cast(c_permute.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(c_permute_dense, + l({{3.0, 2.0, 1.0}, + {5.0, 0.0, 0.0}}), + r::value); + // clang-format on +} + + +TYPED_TEST(Dense, SquareMatrixIsInverseRowPermutable) +{ + // clang-format off + // {1.0, -1.0, -0.5}, + // {-2.0, 2.0, 4.5}, + // {2.1, 3.4, 1.2} + // clang-format on + using Mtx = typename TestFixture::Mtx; + auto exec = this->mtx5->get_executor(); + gko::Array inverse_permute_idxs{exec, {1, 2, 0}}; + auto inverse_row_permute = + 
this->mtx5->inverse_row_permute(&inverse_permute_idxs); + + auto inverse_row_permute_dense = + static_cast(inverse_row_permute.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(inverse_row_permute_dense, + l({{2.1, 3.4, 1.2}, + {1.0, -1.0, -0.5}, + {-2.0, 2.0, 4.5}}), r::value); + // clang-format on +} + + +TYPED_TEST(Dense, NonSquareMatrixIsInverseRowPermutable) +{ + // clang-format off + // {1.0, 3.0, 2.0}, + // {0.0, 5.0, 0.0} + // clang-format on + using Mtx = typename TestFixture::Mtx; + auto exec = this->mtx4->get_executor(); + gko::Array inverse_permute_idxs{exec, {1, 0}}; + auto inverse_row_permute = + this->mtx4->inverse_row_permute(&inverse_permute_idxs); + + auto inverse_row_permute_dense = + static_cast(inverse_row_permute.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(inverse_row_permute_dense, + l({{0.0, 5.0, 0.0}, + {1.0, 3.0, 2.0}}), r::value); + // clang-format on +} + + +TYPED_TEST(Dense, SquareMatrixIsInverseColPermutable) +{ + // clang-format off + // {1.0, -1.0, -0.5}, + // {-2.0, 2.0, 4.5}, + // {2.1, 3.4, 1.2} + // clang-format on + using Mtx = typename TestFixture::Mtx; + auto exec = this->mtx5->get_executor(); + gko::Array inverse_permute_idxs{exec, {1, 2, 0}}; + auto inverse_c_permute = + this->mtx5->inverse_column_permute(&inverse_permute_idxs); + + auto inverse_c_permute_dense = static_cast(inverse_c_permute.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(inverse_c_permute_dense, + l({{-0.5, 1.0, -1.0}, + {4.5, -2.0, 2.0}, + {1.2, 2.1, 3.4}}), r::value); + // clang-format on +} + + +TYPED_TEST(Dense, NonSquareMatrixIsInverseColPermutable) +{ + // clang-format off + // {1.0, 3.0, 2.0}, + // {0.0, 5.0, 0.0} + // clang-format on + using Mtx = typename TestFixture::Mtx; + auto exec = this->mtx4->get_executor(); + gko::Array inverse_permute_idxs{exec, {1, 2, 0}}; + auto inverse_c_permute = + this->mtx4->inverse_column_permute(&inverse_permute_idxs); + + auto inverse_c_permute_dense = static_cast(inverse_c_permute.get()); + // 
clang-format off + GKO_ASSERT_MTX_NEAR(inverse_c_permute_dense, + l({{2.0, 1.0, 3.0}, + {0.0, 0.0, 5.0}}), + r::value); + // clang-format on +} + + +TYPED_TEST(Dense, NonSquareMatrixIsRowPermutable64) +{ + // clang-format off + // {1.0, 3.0, 2.0}, + // {0.0, 5.0, 0.0} + // clang-format on + using Mtx = typename TestFixture::Mtx; + auto exec = this->mtx4->get_executor(); + gko::Array permute_idxs{exec, {1, 0}}; + auto row_permute = this->mtx4->row_permute(&permute_idxs); + + auto row_permute_dense = static_cast(row_permute.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(row_permute_dense, + l({{0.0, 5.0, 0.0}, + {1.0, 3.0, 2.0}}), r::value); + // clang-format on +} + + +TYPED_TEST(Dense, NonSquareMatrixIsColPermutable64) +{ + // clang-format off + // {1.0, 3.0, 2.0}, + // {0.0, 5.0, 0.0} + // clang-format on + using Mtx = typename TestFixture::Mtx; + auto exec = this->mtx4->get_executor(); + gko::Array permute_idxs{exec, {1, 2, 0}}; + auto c_permute = this->mtx4->column_permute(&permute_idxs); + + auto c_permute_dense = static_cast(c_permute.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(c_permute_dense, + l({{3.0, 2.0, 1.0}, + {5.0, 0.0, 0.0}}), + r::value); + // clang-format on +} - auto sellp_mtx = gko::matrix::Sellp<>::create(exec); - auto dense_mtx = gko::matrix::Dense<>::create(exec); - x->convert_to(sellp_mtx.get()); - sellp_mtx->convert_to(dense_mtx.get()); - GKO_ASSERT_MTX_NEAR(dense_mtx.get(), x.get(), 1e-14); +TYPED_TEST(Dense, NonSquareMatrixIsInverseRowPermutable64) +{ + // clang-format off + // {1.0, 3.0, 2.0}, + // {0.0, 5.0, 0.0} + // clang-format on + using Mtx = typename TestFixture::Mtx; + auto exec = this->mtx4->get_executor(); + gko::Array inverse_permute_idxs{exec, {1, 0}}; + auto inverse_row_permute = + this->mtx4->inverse_row_permute(&inverse_permute_idxs); + + auto inverse_row_permute_dense = + static_cast(inverse_row_permute.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(inverse_row_permute_dense, + l({{0.0, 5.0, 0.0}, + {1.0, 
3.0, 2.0}}), r::value); + // clang-format on +} + + +TYPED_TEST(Dense, NonSquareMatrixIsInverseColPermutable64) +{ + // clang-format off + // {1.0, 3.0, 2.0}, + // {0.0, 5.0, 0.0} + // clang-format on + using Mtx = typename TestFixture::Mtx; + auto exec = this->mtx4->get_executor(); + gko::Array inverse_permute_idxs{exec, {1, 2, 0}}; + auto inverse_c_permute = + this->mtx4->inverse_column_permute(&inverse_permute_idxs); + + auto inverse_c_permute_dense = static_cast(inverse_c_permute.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(inverse_c_permute_dense, + l({{2.0, 1.0, 3.0}, + {0.0, 0.0, 5.0}}), + r::value); + // clang-format on +} + + +template +class DenseComplex : public ::testing::Test { +protected: + using value_type = T; + using Mtx = gko::matrix::Dense; +}; + + +TYPED_TEST_CASE(DenseComplex, gko::test::ComplexValueTypes); + + +TYPED_TEST(DenseComplex, NonSquareMatrixIsConjugateTransposable) +{ + using Dense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto exec = gko::ReferenceExecutor::create(); + auto mtx = gko::initialize({{T{1.0, 2.0}, T{-1.0, 2.1}}, + {T{-2.0, 1.5}, T{4.5, 0.0}}, + {T{1.0, 0.0}, T{0.0, 1.0}}}, + exec); + auto trans = mtx->conj_transpose(); + auto trans_as_dense = static_cast(trans.get()); + + GKO_ASSERT_MTX_NEAR(trans_as_dense, + l({{T{1.0, -2.0}, T{-2.0, -1.5}, T{1.0, 0.0}}, + {T{-1.0, -2.1}, T{4.5, 0.0}, T{0.0, -1.0}}}), + 0.0); } diff --git a/reference/test/matrix/ell_kernels.cpp b/reference/test/matrix/ell_kernels.cpp index 806949caf9b..f9a4f401dc2 100644 --- a/reference/test/matrix/ell_kernels.cpp +++ b/reference/test/matrix/ell_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -39,7 +39,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include #include #include #include @@ -47,14 +46,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/test/utils.hpp" + + namespace { +template class Ell : public ::testing::Test { protected: - using Mtx = gko::matrix::Ell<>; - using Csr = gko::matrix::Csr<>; - using Vec = gko::matrix::Dense<>; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using T = value_type; + using Mtx = gko::matrix::Ell; + using Csr = gko::matrix::Csr; + using Vec = gko::matrix::Dense; Ell() : exec(gko::ReferenceExecutor::create()), @@ -85,10 +93,10 @@ class Ell : public ::testing::Test { EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 3.0); - EXPECT_EQ(v[2], 2.0); - EXPECT_EQ(v[3], 5.0); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{3.0}); + EXPECT_EQ(v[2], T{2.0}); + EXPECT_EQ(v[3], T{5.0}); } std::shared_ptr exec; @@ -96,107 +104,161 @@ class Ell : public ::testing::Test { std::unique_ptr mtx2; }; +TYPED_TEST_CASE(Ell, gko::test::ValueIndexTypes); + -TEST_F(Ell, AppliesToDenseVector) +TYPED_TEST(Ell, AppliesToDenseVector) { - auto x = gko::initialize({2.0, 1.0, 4.0}, exec); - auto y = Vec::create(exec, gko::dim<2>{2, 1}); + using Vec = typename TestFixture::Vec; + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); - mtx1->apply(x.get(), y.get()); + this->mtx1->apply(x.get(), y.get()); GKO_ASSERT_MTX_NEAR(y, l({13.0, 5.0}), 0.0); } -TEST_F(Ell, AppliesToDenseMatrix) +TYPED_TEST(Ell, AppliesToDenseMatrix) { + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; // clang-format off auto x = gko::initialize( - {{2.0, 3.0}, - {1.0, -1.5}, - {4.0, 2.5}}, exec); + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); // clang-format on - auto y = 
Vec::create(exec, gko::dim<2>{2}); + auto y = Vec::create(this->exec, gko::dim<2>{2}); - mtx1->apply(x.get(), y.get()); + this->mtx1->apply(x.get(), y.get()); // clang-format off GKO_ASSERT_MTX_NEAR(y, - l({{13.0, 3.5}, - { 5.0, -7.5}}), 0.0); + l({{13.0, 3.5}, + { 5.0, -7.5}}), 0.0); // clang-format on } -TEST_F(Ell, AppliesLinearCombinationToDenseVector) +TYPED_TEST(Ell, AppliesLinearCombinationToDenseVector) { - auto alpha = gko::initialize({-1.0}, exec); - auto beta = gko::initialize({2.0}, exec); - auto x = gko::initialize({2.0, 1.0, 4.0}, exec); - auto y = gko::initialize({1.0, 2.0}, exec); + using Vec = typename TestFixture::Vec; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = gko::initialize({1.0, 2.0}, this->exec); - mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); + this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); GKO_ASSERT_MTX_NEAR(y, l({-11.0, -1.0}), 0.0); } -TEST_F(Ell, AppliesLinearCombinationToDenseMatrix) +TYPED_TEST(Ell, AppliesLinearCombinationToDenseMatrix) { - auto alpha = gko::initialize({-1.0}, exec); - auto beta = gko::initialize({2.0}, exec); + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); // clang-format off auto x = gko::initialize( - {{2.0, 3.0}, - {1.0, -1.5}, - {4.0, 2.5}}, exec); + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); auto y = gko::initialize( - {{1.0, 0.5}, - {2.0, -1.5}}, exec); + {I{1.0, 0.5}, + I{2.0, -1.5}}, this->exec); // clang-format on - mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); + this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); // clang-format off GKO_ASSERT_MTX_NEAR(y, - l({{-11.0, -2.5}, - { -1.0, 4.5}}), 0.0); + l({{-11.0, -2.5}, + { -1.0, 4.5}}), 0.0); // clang-format on } -TEST_F(Ell, 
ApplyFailsOnWrongInnerDimension) +TYPED_TEST(Ell, ApplyFailsOnWrongInnerDimension) +{ + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{2}); + auto y = Vec::create(this->exec, gko::dim<2>{2}); + + ASSERT_THROW(this->mtx1->apply(x.get(), y.get()), gko::DimensionMismatch); +} + + +TYPED_TEST(Ell, ApplyFailsOnWrongNumberOfRows) { - auto x = Vec::create(exec, gko::dim<2>{2}); - auto y = Vec::create(exec, gko::dim<2>{2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{3, 2}); + auto y = Vec::create(this->exec, gko::dim<2>{3, 2}); - ASSERT_THROW(mtx1->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx1->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Ell, ApplyFailsOnWrongNumberOfRows) +TYPED_TEST(Ell, ApplyFailsOnWrongNumberOfCols) { - auto x = Vec::create(exec, gko::dim<2>{3, 2}); - auto y = Vec::create(exec, gko::dim<2>{3, 2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{3}, 2); + auto y = Vec::create(this->exec, gko::dim<2>{2}); - ASSERT_THROW(mtx1->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx1->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Ell, ApplyFailsOnWrongNumberOfCols) +TYPED_TEST(Ell, ConvertsToPrecision) { - auto x = Vec::create(exec, gko::dim<2>{3}, 2); - auto y = Vec::create(exec, gko::dim<2>{2}); + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using OtherType = typename gko::next_precision; + using Ell = typename TestFixture::Mtx; + using OtherEll = gko::matrix::Ell; + auto tmp = OtherEll::create(this->exec); + auto res = Ell::create(this->exec); + // If OtherType is more precise: 0, otherwise r + auto residual = r::value < r::value + ? 
gko::remove_complex{0} + : gko::remove_complex{r::value}; + + this->mtx1->convert_to(tmp.get()); + tmp->convert_to(res.get()); + + GKO_ASSERT_MTX_NEAR(this->mtx1, res, residual); +} - ASSERT_THROW(mtx1->apply(x.get(), y.get()), gko::DimensionMismatch); + +TYPED_TEST(Ell, MovesToPrecision) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using OtherType = typename gko::next_precision; + using Ell = typename TestFixture::Mtx; + using OtherEll = gko::matrix::Ell; + auto tmp = OtherEll::create(this->exec); + auto res = Ell::create(this->exec); + // If OtherType is more precise: 0, otherwise r + auto residual = r::value < r::value + ? gko::remove_complex{0} + : gko::remove_complex{r::value}; + + this->mtx1->move_to(tmp.get()); + tmp->move_to(res.get()); + + GKO_ASSERT_MTX_NEAR(this->mtx1, res, residual); } -TEST_F(Ell, ConvertsToDense) +TYPED_TEST(Ell, ConvertsToDense) { - auto dense_mtx = gko::matrix::Dense<>::create(mtx1->get_executor()); + using Vec = typename TestFixture::Vec; + auto dense_mtx = Vec::create(this->mtx1->get_executor()); - mtx1->convert_to(dense_mtx.get()); + this->mtx1->convert_to(dense_mtx.get()); // clang-format off GKO_ASSERT_MTX_NEAR(dense_mtx, @@ -206,11 +268,12 @@ TEST_F(Ell, ConvertsToDense) } -TEST_F(Ell, MovesToDense) +TYPED_TEST(Ell, MovesToDense) { - auto dense_mtx = gko::matrix::Dense<>::create(mtx1->get_executor()); + using Vec = typename TestFixture::Vec; + auto dense_mtx = Vec::create(this->mtx1->get_executor()); - mtx1->move_to(dense_mtx.get()); + this->mtx1->move_to(dense_mtx.get()); // clang-format off GKO_ASSERT_MTX_NEAR(dense_mtx, @@ -220,111 +283,121 @@ TEST_F(Ell, MovesToDense) } -TEST_F(Ell, AppliesWithStrideToDenseVector) +TYPED_TEST(Ell, AppliesWithStrideToDenseVector) { - auto x = gko::initialize({2.0, 1.0, 4.0}, exec); - auto y = Vec::create(exec, gko::dim<2>{2, 1}); + using Vec = typename TestFixture::Vec; + auto x = gko::initialize({2.0, 1.0, 4.0}, 
this->exec); + auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); - mtx2->apply(x.get(), y.get()); + this->mtx2->apply(x.get(), y.get()); GKO_ASSERT_MTX_NEAR(y, l({13.0, 5.0}), 0.0); } -TEST_F(Ell, AppliesWithStrideToDenseMatrix) +TYPED_TEST(Ell, AppliesWithStrideToDenseMatrix) { + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; // clang-format off auto x = gko::initialize( - {{2.0, 3.0}, - {1.0, -1.5}, - {4.0, 2.5}}, exec); + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); // clang-format on - auto y = Vec::create(exec, gko::dim<2>{2}); + auto y = Vec::create(this->exec, gko::dim<2>{2}); - mtx2->apply(x.get(), y.get()); + this->mtx2->apply(x.get(), y.get()); // clang-format off GKO_ASSERT_MTX_NEAR(y, - l({{13.0, 3.5}, - {5.0, -7.5}}), 0.0); + l({{13.0, 3.5}, + {5.0, -7.5}}), 0.0); // clang-format on } -TEST_F(Ell, AppliesWithStrideLinearCombinationToDenseVector) +TYPED_TEST(Ell, AppliesWithStrideLinearCombinationToDenseVector) { - auto alpha = gko::initialize({-1.0}, exec); - auto beta = gko::initialize({2.0}, exec); - auto x = gko::initialize({2.0, 1.0, 4.0}, exec); - auto y = gko::initialize({1.0, 2.0}, exec); + using Vec = typename TestFixture::Vec; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = gko::initialize({1.0, 2.0}, this->exec); - mtx2->apply(alpha.get(), x.get(), beta.get(), y.get()); + this->mtx2->apply(alpha.get(), x.get(), beta.get(), y.get()); GKO_ASSERT_MTX_NEAR(y, l({-11.0, -1.0}), 0.0); } -TEST_F(Ell, AppliesWithStrideLinearCombinationToDenseMatrix) +TYPED_TEST(Ell, AppliesWithStrideLinearCombinationToDenseMatrix) { - auto alpha = gko::initialize({-1.0}, exec); - auto beta = gko::initialize({2.0}, exec); + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, 
this->exec); // clang-format off auto x = gko::initialize( - {{2.0, 3.0}, - {1.0, -1.5}, - {4.0, 2.5}}, exec); + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); auto y = gko::initialize( - {{1.0, 0.5}, - {2.0, -1.5}}, exec); + {I{1.0, 0.5}, + I{2.0, -1.5}}, this->exec); // clang-format on - mtx2->apply(alpha.get(), x.get(), beta.get(), y.get()); + this->mtx2->apply(alpha.get(), x.get(), beta.get(), y.get()); // clang-format off GKO_ASSERT_MTX_NEAR(y, - l({{-11.0, -2.5}, - {-1.0, 4.5}}), 0.0); + l({{-11.0, -2.5}, + {-1.0, 4.5}}), 0.0); // clang-format on } -TEST_F(Ell, ApplyWithStrideFailsOnWrongInnerDimension) +TYPED_TEST(Ell, ApplyWithStrideFailsOnWrongInnerDimension) { - auto x = Vec::create(exec, gko::dim<2>{2}); - auto y = Vec::create(exec, gko::dim<2>{2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{2}); + auto y = Vec::create(this->exec, gko::dim<2>{2}); - ASSERT_THROW(mtx2->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx2->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Ell, ApplyWithStrideFailsOnWrongNumberOfRows) +TYPED_TEST(Ell, ApplyWithStrideFailsOnWrongNumberOfRows) { - auto x = Vec::create(exec, gko::dim<2>{3, 2}); - auto y = Vec::create(exec, gko::dim<2>{3, 2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{3, 2}); + auto y = Vec::create(this->exec, gko::dim<2>{3, 2}); - ASSERT_THROW(mtx2->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx2->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Ell, ApplyWithStrideFailsOnWrongNumberOfCols) +TYPED_TEST(Ell, ApplyWithStrideFailsOnWrongNumberOfCols) { - auto x = Vec::create(exec, gko::dim<2>{3}, 2); - auto y = Vec::create(exec, gko::dim<2>{2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{3}, 2); + auto y = Vec::create(this->exec, gko::dim<2>{2}); - ASSERT_THROW(mtx2->apply(x.get(), y.get()), 
gko::DimensionMismatch); + ASSERT_THROW(this->mtx2->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Ell, ConvertsWithStrideToDense) +TYPED_TEST(Ell, ConvertsWithStrideToDense) { - auto dense_mtx = gko::matrix::Dense<>::create(mtx2->get_executor()); + using Vec = typename TestFixture::Vec; + auto dense_mtx = Vec::create(this->mtx2->get_executor()); // clang-format off - auto dense_other = gko::initialize>( + auto dense_other = gko::initialize( 4, {{1.0, 3.0, 2.0}, - {0.0, 5.0, 0.0}}, exec); + {0.0, 5.0, 0.0}}, this->exec); // clang-format on - mtx2->convert_to(dense_mtx.get()); + this->mtx2->convert_to(dense_mtx.get()); // clang-format off GKO_ASSERT_MTX_NEAR(dense_mtx, @@ -334,11 +407,12 @@ TEST_F(Ell, ConvertsWithStrideToDense) } -TEST_F(Ell, MovesWithStrideToDense) +TYPED_TEST(Ell, MovesWithStrideToDense) { - auto dense_mtx = gko::matrix::Dense<>::create(mtx2->get_executor()); + using Vec = typename TestFixture::Vec; + auto dense_mtx = Vec::create(this->mtx2->get_executor()); - mtx2->move_to(dense_mtx.get()); + this->mtx2->move_to(dense_mtx.get()); // clang-format off GKO_ASSERT_MTX_NEAR(dense_mtx, @@ -348,42 +422,179 @@ TEST_F(Ell, MovesWithStrideToDense) } -TEST_F(Ell, ConvertsToCsr) +TYPED_TEST(Ell, ConvertsToCsr) +{ + using Vec = typename TestFixture::Vec; + using Csr = typename TestFixture::Csr; + auto csr_s_classical = std::make_shared(); + auto csr_s_merge = std::make_shared(); + auto csr_mtx_c = Csr::create(this->mtx1->get_executor(), csr_s_classical); + auto csr_mtx_m = Csr::create(this->mtx1->get_executor(), csr_s_merge); + + this->mtx1->convert_to(csr_mtx_c.get()); + this->mtx1->convert_to(csr_mtx_m.get()); + + this->assert_equal_to_mtx(csr_mtx_c.get()); + this->assert_equal_to_mtx(csr_mtx_m.get()); + ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical"); + ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path"); +} + + +TYPED_TEST(Ell, MovesToCsr) +{ + using Vec = typename TestFixture::Vec; + using Csr = typename 
TestFixture::Csr; + auto csr_s_classical = std::make_shared(); + auto csr_s_merge = std::make_shared(); + auto csr_mtx_c = Csr::create(this->mtx1->get_executor(), csr_s_classical); + auto csr_mtx_m = Csr::create(this->mtx1->get_executor(), csr_s_merge); + + this->mtx1->move_to(csr_mtx_c.get()); + this->mtx1->move_to(csr_mtx_m.get()); + + this->assert_equal_to_mtx(csr_mtx_c.get()); + this->assert_equal_to_mtx(csr_mtx_m.get()); + ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical"); + ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path"); +} + + +TYPED_TEST(Ell, ConvertsWithStrideToCsr) { - auto csr_s_classical = std::make_shared::classical>(); - auto csr_s_merge = std::make_shared::merge_path>(); - auto csr_mtx_c = - gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_classical); - auto csr_mtx_m = - gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_merge); - - mtx1->convert_to(csr_mtx_c.get()); - mtx1->convert_to(csr_mtx_m.get()); - - assert_equal_to_mtx(csr_mtx_c.get()); - assert_equal_to_mtx(csr_mtx_m.get()); - ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical); - ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge); + using Vec = typename TestFixture::Vec; + using Csr = typename TestFixture::Csr; + auto csr_s_classical = std::make_shared(); + auto csr_s_merge = std::make_shared(); + auto csr_mtx_c = Csr::create(this->mtx2->get_executor(), csr_s_classical); + auto csr_mtx_m = Csr::create(this->mtx2->get_executor(), csr_s_merge); + auto mtx_clone = this->mtx2->clone(); + + this->mtx2->convert_to(csr_mtx_c.get()); + mtx_clone->convert_to(csr_mtx_m.get()); + + this->assert_equal_to_mtx(csr_mtx_c.get()); + this->assert_equal_to_mtx(csr_mtx_m.get()); + ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical"); + ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path"); } -TEST_F(Ell, ConvertsWithStrideToCsr) +TYPED_TEST(Ell, MovesWithStrideToCsr) { - auto csr_s_classical = std::make_shared::classical>(); - auto csr_s_merge = 
std::make_shared::merge_path>(); - auto csr_mtx_c = - gko::matrix::Csr<>::create(mtx2->get_executor(), csr_s_classical); - auto csr_mtx_m = - gko::matrix::Csr<>::create(mtx2->get_executor(), csr_s_merge); - auto mtx_clone = mtx2->clone(); - - mtx2->move_to(csr_mtx_c.get()); + using Vec = typename TestFixture::Vec; + using Csr = typename TestFixture::Csr; + auto csr_s_classical = std::make_shared(); + auto csr_s_merge = std::make_shared(); + auto csr_mtx_c = Csr::create(this->mtx2->get_executor(), csr_s_classical); + auto csr_mtx_m = Csr::create(this->mtx2->get_executor(), csr_s_merge); + auto mtx_clone = this->mtx2->clone(); + + this->mtx2->move_to(csr_mtx_c.get()); mtx_clone->move_to(csr_mtx_m.get()); - assert_equal_to_mtx(csr_mtx_c.get()); - assert_equal_to_mtx(csr_mtx_m.get()); - ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical); - ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge); + this->assert_equal_to_mtx(csr_mtx_c.get()); + this->assert_equal_to_mtx(csr_mtx_m.get()); + ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical"); + ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path"); +} + + +TYPED_TEST(Ell, ConvertsEmptyToPrecision) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using OtherType = typename gko::next_precision; + using Ell = typename TestFixture::Mtx; + using OtherEll = gko::matrix::Ell; + auto empty = Ell::create(this->exec); + auto res = OtherEll::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Ell, MovesEmptyToPrecision) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using OtherType = typename gko::next_precision; + using Ell = typename TestFixture::Mtx; + using OtherEll = gko::matrix::Ell; + auto empty = Ell::create(this->exec); + auto res = OtherEll::create(this->exec); + + 
empty->move_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Ell, ConvertsEmptyToDense) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Ell = typename TestFixture::Mtx; + using Dense = gko::matrix::Dense; + auto empty = Ell::create(this->exec); + auto res = Dense::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Ell, MovesEmptyToDense) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Ell = typename TestFixture::Mtx; + using Dense = gko::matrix::Dense; + auto empty = Ell::create(this->exec); + auto res = Dense::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Ell, ConvertsEmptyToCsr) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Ell = typename TestFixture::Mtx; + using Csr = gko::matrix::Csr; + auto empty = Ell::create(this->exec); + auto res = Csr::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_row_ptrs(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Ell, MovesEmptyToCsr) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Ell = typename TestFixture::Mtx; + using Csr = gko::matrix::Csr; + auto empty = Ell::create(this->exec); + auto res = Csr::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_row_ptrs(), 0); + ASSERT_FALSE(res->get_size()); } diff --git a/reference/test/matrix/hybrid_kernels.cpp b/reference/test/matrix/hybrid_kernels.cpp index 1ca6b61cd20..c1f98b67f4f 100644 --- a/reference/test/matrix/hybrid_kernels.cpp +++ 
b/reference/test/matrix/hybrid_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/hybrid_kernels.hpp" +#include #include @@ -39,7 +39,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include #include #include @@ -47,14 +46,24 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/matrix/hybrid_kernels.hpp" +#include "core/test/utils.hpp" + + namespace { +template class Hybrid : public ::testing::Test { protected: - using Mtx = gko::matrix::Hybrid<>; - using Vec = gko::matrix::Dense<>; - using Csr = gko::matrix::Csr<>; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using T = value_type; + using Mtx = gko::matrix::Hybrid; + using Vec = gko::matrix::Dense; + using Csr = gko::matrix::Csr; Hybrid() : exec(gko::ReferenceExecutor::create()), @@ -109,10 +118,10 @@ class Hybrid : public ::testing::Test { EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], 1.0); - EXPECT_EQ(v[1], 3.0); - EXPECT_EQ(v[2], 2.0); - EXPECT_EQ(v[3], 5.0); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{3.0}); + EXPECT_EQ(v[2], T{2.0}); + EXPECT_EQ(v[3], T{5.0}); } std::shared_ptr exec; @@ -121,66 +130,74 @@ class Hybrid : public ::testing::Test { std::unique_ptr mtx3; }; +TYPED_TEST_CASE(Hybrid, gko::test::ValueIndexTypes); + -TEST_F(Hybrid, AppliesToDenseVector) +TYPED_TEST(Hybrid, AppliesToDenseVector) { 
- auto x = gko::initialize({2.0, 1.0, 4.0}, exec); - auto y = Vec::create(exec, gko::dim<2>{2, 1}); + using Vec = typename TestFixture::Vec; + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); - mtx1->apply(x.get(), y.get()); + this->mtx1->apply(x.get(), y.get()); GKO_ASSERT_MTX_NEAR(y, l({13.0, 5.0}), 0.0); } -TEST_F(Hybrid, AppliesToDenseMatrix) +TYPED_TEST(Hybrid, AppliesToDenseMatrix) { + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; // clang-format off auto x = gko::initialize( - {{2.0, 3.0}, - {1.0, -1.5}, - {4.0, 2.5}}, exec); + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); // clang-format on - auto y = Vec::create(exec, gko::dim<2>{2}); + auto y = Vec::create(this->exec, gko::dim<2>{2}); - mtx1->apply(x.get(), y.get()); + this->mtx1->apply(x.get(), y.get()); // clang-format off GKO_ASSERT_MTX_NEAR(y, - l({{13.0, 3.5}, - { 5.0, -7.5}}), 0.0); + l({{13.0, 3.5}, + { 5.0, -7.5}}), 0.0); // clang-format on } -TEST_F(Hybrid, AppliesLinearCombinationToDenseVector) +TYPED_TEST(Hybrid, AppliesLinearCombinationToDenseVector) { - auto alpha = gko::initialize({-1.0}, exec); - auto beta = gko::initialize({2.0}, exec); - auto x = gko::initialize({2.0, 1.0, 4.0}, exec); - auto y = gko::initialize({1.0, 2.0}, exec); + using Vec = typename TestFixture::Vec; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = gko::initialize({1.0, 2.0}, this->exec); - mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); + this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); GKO_ASSERT_MTX_NEAR(y, l({-11.0, -1.0}), 0.0); } -TEST_F(Hybrid, AppliesLinearCombinationToDenseMatrix) +TYPED_TEST(Hybrid, AppliesLinearCombinationToDenseMatrix) { - auto alpha = gko::initialize({-1.0}, exec); - auto beta = gko::initialize({2.0}, exec); + using Vec = typename 
TestFixture::Vec; + using T = typename TestFixture::value_type; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); // clang-format off auto x = gko::initialize( - {{2.0, 3.0}, - {1.0, -1.5}, - {4.0, 2.5}}, exec); + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); auto y = gko::initialize( - {{1.0, 0.5}, - {2.0, -1.5}}, exec); + {I{1.0, 0.5}, + I{2.0, -1.5}}, this->exec); // clang-format on - mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); + this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); // clang-format off GKO_ASSERT_MTX_NEAR(y, @@ -190,38 +207,84 @@ TEST_F(Hybrid, AppliesLinearCombinationToDenseMatrix) } -TEST_F(Hybrid, ApplyFailsOnWrongInnerDimension) +TYPED_TEST(Hybrid, ApplyFailsOnWrongInnerDimension) { - auto x = Vec::create(exec, gko::dim<2>{2}); - auto y = Vec::create(exec, gko::dim<2>{2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{2}); + auto y = Vec::create(this->exec, gko::dim<2>{2}); - ASSERT_THROW(mtx1->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx1->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Hybrid, ApplyFailsOnWrongNumberOfRows) +TYPED_TEST(Hybrid, ApplyFailsOnWrongNumberOfRows) { - auto x = Vec::create(exec, gko::dim<2>{3, 2}); - auto y = Vec::create(exec, gko::dim<2>{3, 2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{3, 2}); + auto y = Vec::create(this->exec, gko::dim<2>{3, 2}); - ASSERT_THROW(mtx1->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx1->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Hybrid, ApplyFailsOnWrongNumberOfCols) +TYPED_TEST(Hybrid, ApplyFailsOnWrongNumberOfCols) { - auto x = Vec::create(exec, gko::dim<2>{3}, 2); - auto y = Vec::create(exec, gko::dim<2>{2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{3}, 2); + auto y = 
Vec::create(this->exec, gko::dim<2>{2}); - ASSERT_THROW(mtx1->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx1->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Hybrid, ConvertsToDense) +TYPED_TEST(Hybrid, ConvertsToPrecision) { - auto dense_mtx = gko::matrix::Dense<>::create(mtx1->get_executor()); + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using OtherType = typename gko::next_precision; + using Hybrid = typename TestFixture::Mtx; + using OtherHybrid = gko::matrix::Hybrid; + auto tmp = OtherHybrid::create(this->exec); + auto res = Hybrid::create(this->exec); + // If OtherType is more precise: 0, otherwise r + auto residual = r::value < r::value + ? gko::remove_complex{0} + : gko::remove_complex{r::value}; + + this->mtx1->convert_to(tmp.get()); + tmp->convert_to(res.get()); + + GKO_ASSERT_MTX_NEAR(this->mtx1, res, residual); +} - mtx1->convert_to(dense_mtx.get()); + +TYPED_TEST(Hybrid, MovesToPrecision) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using OtherType = typename gko::next_precision; + using Hybrid = typename TestFixture::Mtx; + using OtherHybrid = gko::matrix::Hybrid; + auto tmp = OtherHybrid::create(this->exec); + auto res = Hybrid::create(this->exec); + // If OtherType is more precise: 0, otherwise r + auto residual = r::value < r::value + ? 
gko::remove_complex{0} + : gko::remove_complex{r::value}; + + this->mtx1->move_to(tmp.get()); + tmp->move_to(res.get()); + + GKO_ASSERT_MTX_NEAR(this->mtx1, res, residual); +} + + +TYPED_TEST(Hybrid, ConvertsToDense) +{ + using Vec = typename TestFixture::Vec; + auto dense_mtx = Vec::create(this->mtx1->get_executor()); + + this->mtx1->convert_to(dense_mtx.get()); // clang-format off GKO_ASSERT_MTX_NEAR(dense_mtx, @@ -231,11 +294,12 @@ TEST_F(Hybrid, ConvertsToDense) } -TEST_F(Hybrid, MovesToDense) +TYPED_TEST(Hybrid, MovesToDense) { - auto dense_mtx = gko::matrix::Dense<>::create(mtx1->get_executor()); + using Vec = typename TestFixture::Vec; + auto dense_mtx = Vec::create(this->mtx1->get_executor()); - mtx1->move_to(dense_mtx.get()); + this->mtx1->move_to(dense_mtx.get()); // clang-format off GKO_ASSERT_MTX_NEAR(dense_mtx, @@ -245,98 +309,199 @@ TEST_F(Hybrid, MovesToDense) } -TEST_F(Hybrid, ConvertsToCsr) +TYPED_TEST(Hybrid, ConvertsToCsr) { - auto csr_s_classical = std::make_shared::classical>(); - auto csr_s_merge = std::make_shared::merge_path>(); - auto csr_mtx_c = - gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_classical); - auto csr_mtx_m = - gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_merge); - - mtx1->convert_to(csr_mtx_c.get()); - mtx1->convert_to(csr_mtx_m.get()); - - assert_equal_to_mtx(csr_mtx_c.get()); - assert_equal_to_mtx(csr_mtx_m.get()); - ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical); - ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge); + using Csr = typename TestFixture::Csr; + auto csr_s_classical = std::make_shared(); + auto csr_s_merge = std::make_shared(); + auto csr_mtx_c = Csr::create(this->mtx1->get_executor(), csr_s_classical); + auto csr_mtx_m = Csr::create(this->mtx1->get_executor(), csr_s_merge); + + this->mtx1->convert_to(csr_mtx_c.get()); + this->mtx1->convert_to(csr_mtx_m.get()); + + this->assert_equal_to_mtx(csr_mtx_c.get()); + this->assert_equal_to_mtx(csr_mtx_m.get()); + 
ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical"); + ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path"); } -TEST_F(Hybrid, MovesToCsr) +TYPED_TEST(Hybrid, MovesToCsr) { - auto csr_s_classical = std::make_shared::classical>(); - auto csr_s_merge = std::make_shared::merge_path>(); - auto csr_mtx_c = - gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_classical); - auto csr_mtx_m = - gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_merge); - auto mtx_clone = mtx1->clone(); - - mtx1->move_to(csr_mtx_c.get()); + using Csr = typename TestFixture::Csr; + auto csr_s_classical = std::make_shared(); + auto csr_s_merge = std::make_shared(); + auto csr_mtx_c = Csr::create(this->mtx1->get_executor(), csr_s_classical); + auto csr_mtx_m = Csr::create(this->mtx1->get_executor(), csr_s_merge); + auto mtx_clone = this->mtx1->clone(); + + this->mtx1->move_to(csr_mtx_c.get()); mtx_clone->move_to(csr_mtx_m.get()); - assert_equal_to_mtx(csr_mtx_c.get()); - assert_equal_to_mtx(csr_mtx_m.get()); - ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical); - ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge); + this->assert_equal_to_mtx(csr_mtx_c.get()); + this->assert_equal_to_mtx(csr_mtx_m.get()); + ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical"); + ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path"); +} + + +TYPED_TEST(Hybrid, ConvertsToCsrWithoutZeros) +{ + using Csr = typename TestFixture::Csr; + auto csr_mtx = Csr::create(this->mtx3->get_executor()); + + this->mtx3->convert_to(csr_mtx.get()); + + this->assert_equal_to_mtx(csr_mtx.get()); +} + + +TYPED_TEST(Hybrid, MovesToCsrWithoutZeros) +{ + using Csr = typename TestFixture::Csr; + auto csr_mtx = Csr::create(this->mtx3->get_executor()); + + this->mtx3->move_to(csr_mtx.get()); + + this->assert_equal_to_mtx(csr_mtx.get()); } -TEST_F(Hybrid, ConvertsToCsrWithoutZeros) +TYPED_TEST(Hybrid, ConvertsEmptyToPrecision) { - auto csr_mtx = Csr::create(mtx3->get_executor()); + using 
ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using OtherType = typename gko::next_precision; + using Hybrid = typename TestFixture::Mtx; + using OtherHybrid = gko::matrix::Hybrid; + auto other = Hybrid::create(this->exec); + auto res = OtherHybrid::create(this->exec); + + other->convert_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); +} - mtx3->convert_to(csr_mtx.get()); - assert_equal_to_mtx(csr_mtx.get()); +TYPED_TEST(Hybrid, MovesEmptyToPrecision) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using OtherType = typename gko::next_precision; + using Hybrid = typename TestFixture::Mtx; + using OtherHybrid = gko::matrix::Hybrid; + auto other = Hybrid::create(this->exec); + auto res = OtherHybrid::create(this->exec); + + other->move_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); } -TEST_F(Hybrid, MovesToCsrWithoutZeros) +TYPED_TEST(Hybrid, ConvertsEmptyToDense) { - auto csr_mtx = Csr::create(mtx3->get_executor()); + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Hybrid = typename TestFixture::Mtx; + using Dense = gko::matrix::Dense; + auto other = Hybrid::create(this->exec); + auto res = Dense::create(this->exec); - mtx3->move_to(csr_mtx.get()); + other->convert_to(res.get()); - assert_equal_to_mtx(csr_mtx.get()); + ASSERT_FALSE(res->get_size()); } -TEST_F(Hybrid, CountsNonzeros) +TYPED_TEST(Hybrid, MovesEmptyToDense) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Hybrid = typename TestFixture::Mtx; + using Dense = gko::matrix::Dense; + auto other = Hybrid::create(this->exec); + auto res = Dense::create(this->exec); + + other->move_to(res.get()); + + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Hybrid, 
ConvertsEmptyToCsr) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Hybrid = typename TestFixture::Mtx; + using Csr = gko::matrix::Csr; + auto other = Hybrid::create(this->exec); + auto res = Csr::create(this->exec); + + other->convert_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_row_ptrs(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Hybrid, MovesEmptyToCsr) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Hybrid = typename TestFixture::Mtx; + using Csr = gko::matrix::Csr; + auto other = Hybrid::create(this->exec); + auto res = Csr::create(this->exec); + + other->move_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_row_ptrs(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Hybrid, CountsNonzeros) { gko::size_type nonzeros; - gko::kernels::reference::hybrid::count_nonzeros(exec, mtx1.get(), - &nonzeros); + gko::kernels::reference::hybrid::count_nonzeros( + this->exec, this->mtx1.get(), &nonzeros); ASSERT_EQ(nonzeros, 4); } -TEST_F(Hybrid, AppliesWithStrideToDenseVector) +TYPED_TEST(Hybrid, AppliesWithStrideToDenseVector) { - auto x = gko::initialize({2.0, 1.0, 4.0}, exec); - auto y = Vec::create(exec, gko::dim<2>{2, 1}); + using Vec = typename TestFixture::Vec; + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); - mtx2->apply(x.get(), y.get()); + this->mtx2->apply(x.get(), y.get()); GKO_ASSERT_MTX_NEAR(y, l({13.0, 5.0}), 0.0); } -TEST_F(Hybrid, AppliesWithStrideToDenseMatrix) +TYPED_TEST(Hybrid, AppliesWithStrideToDenseMatrix) { + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; // clang-format off auto x = gko::initialize( - {{2.0, 3.0}, - {1.0, -1.5}, - {4.0, 2.5}}, exec); + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 
2.5}}, this->exec); // clang-format on - auto y = Vec::create(exec, gko::dim<2>{2}); + auto y = Vec::create(this->exec, gko::dim<2>{2}); - mtx2->apply(x.get(), y.get()); + this->mtx2->apply(x.get(), y.get()); // clang-format off GKO_ASSERT_MTX_NEAR(y, @@ -346,80 +511,87 @@ TEST_F(Hybrid, AppliesWithStrideToDenseMatrix) } -TEST_F(Hybrid, AppliesWithStrideLinearCombinationToDenseVector) +TYPED_TEST(Hybrid, AppliesWithStrideLinearCombinationToDenseVector) { - auto alpha = gko::initialize({-1.0}, exec); - auto beta = gko::initialize({2.0}, exec); - auto x = gko::initialize({2.0, 1.0, 4.0}, exec); - auto y = gko::initialize({1.0, 2.0}, exec); + using Vec = typename TestFixture::Vec; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = gko::initialize({1.0, 2.0}, this->exec); - mtx2->apply(alpha.get(), x.get(), beta.get(), y.get()); + this->mtx2->apply(alpha.get(), x.get(), beta.get(), y.get()); GKO_ASSERT_MTX_NEAR(y, l({-11.0, -1.0}), 0.0); } -TEST_F(Hybrid, AppliesWithStrideLinearCombinationToDenseMatrix) +TYPED_TEST(Hybrid, AppliesWithStrideLinearCombinationToDenseMatrix) { - auto alpha = gko::initialize({-1.0}, exec); - auto beta = gko::initialize({2.0}, exec); + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); // clang-format off auto x = gko::initialize( - {{2.0, 3.0}, - {1.0, -1.5}, - {4.0, 2.5}}, exec); + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); auto y = gko::initialize( - {{1.0, 0.5}, - {2.0, -1.5}}, exec); + {I{1.0, 0.5}, + I{2.0, -1.5}}, this->exec); // clang-format on - mtx2->apply(alpha.get(), x.get(), beta.get(), y.get()); + this->mtx2->apply(alpha.get(), x.get(), beta.get(), y.get()); // clang-format off GKO_ASSERT_MTX_NEAR(y, - l({{-11.0, -2.5}, - {-1.0, 4.5}}), 0.0); + l({{-11.0, -2.5}, 
+ {-1.0, 4.5}}), 0.0); // clang-format on } -TEST_F(Hybrid, ApplyWithStrideFailsOnWrongInnerDimension) +TYPED_TEST(Hybrid, ApplyWithStrideFailsOnWrongInnerDimension) { - auto x = Vec::create(exec, gko::dim<2>{2}); - auto y = Vec::create(exec, gko::dim<2>{2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{2}); + auto y = Vec::create(this->exec, gko::dim<2>{2}); - ASSERT_THROW(mtx2->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx2->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Hybrid, ApplyWithStrideFailsOnWrongNumberOfRows) +TYPED_TEST(Hybrid, ApplyWithStrideFailsOnWrongNumberOfRows) { - auto x = Vec::create(exec, gko::dim<2>{3, 2}); - auto y = Vec::create(exec, gko::dim<2>{3, 2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{3, 2}); + auto y = Vec::create(this->exec, gko::dim<2>{3, 2}); - ASSERT_THROW(mtx2->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx2->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Hybrid, ApplyWithStrideFailsOnWrongNumberOfCols) +TYPED_TEST(Hybrid, ApplyWithStrideFailsOnWrongNumberOfCols) { - auto x = Vec::create(exec, gko::dim<2>{3}, 2); - auto y = Vec::create(exec, gko::dim<2>{2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{3}, 2); + auto y = Vec::create(this->exec, gko::dim<2>{2}); - ASSERT_THROW(mtx2->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx2->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Hybrid, ConvertsWithStrideToDense) +TYPED_TEST(Hybrid, ConvertsWithStrideToDense) { - auto dense_mtx = gko::matrix::Dense<>::create(mtx2->get_executor()); + using Vec = typename TestFixture::Vec; + auto dense_mtx = Vec::create(this->mtx2->get_executor()); // clang-format off - auto dense_other = gko::initialize>( + auto dense_other = gko::initialize( 4, {{1.0, 3.0, 2.0}, - {0.0, 5.0, 0.0}}, exec); + 
{0.0, 5.0, 0.0}}, this->exec); // clang-format on - mtx2->convert_to(dense_mtx.get()); + this->mtx2->convert_to(dense_mtx.get()); // clang-format off GKO_ASSERT_MTX_NEAR(dense_mtx, @@ -429,11 +601,12 @@ TEST_F(Hybrid, ConvertsWithStrideToDense) } -TEST_F(Hybrid, MovesWithStrideToDense) +TYPED_TEST(Hybrid, MovesWithStrideToDense) { - auto dense_mtx = gko::matrix::Dense<>::create(mtx2->get_executor()); + using Vec = typename TestFixture::Vec; + auto dense_mtx = Vec::create(this->mtx2->get_executor()); - mtx2->move_to(dense_mtx.get()); + this->mtx2->move_to(dense_mtx.get()); // clang-format off GKO_ASSERT_MTX_NEAR(dense_mtx, diff --git a/reference/test/matrix/identity.cpp b/reference/test/matrix/identity.cpp index b725b83f83b..0ade3bce9f1 100644 --- a/reference/test/matrix/identity.cpp +++ b/reference/test/matrix/identity.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -36,17 +36,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include #include +#include "core/test/utils.hpp" + + namespace { +template class Identity : public ::testing::Test { protected: - using Id = gko::matrix::Identity<>; - using Vec = gko::matrix::Dense<>; + using value_type = T; + using Id = gko::matrix::Identity; + using Vec = gko::matrix::Dense; Identity() : exec(gko::ReferenceExecutor::create()) {} @@ -54,13 +58,18 @@ class Identity : public ::testing::Test { }; -TEST_F(Identity, AppliesLinearCombinationToVector) +TYPED_TEST_CASE(Identity, gko::test::ValueTypes); + + +TYPED_TEST(Identity, AppliesLinearCombinationToVector) { - auto identity = Id::create(exec, 3); - auto alpha = gko::initialize({2.0}, exec); - auto beta = gko::initialize({1.0}, exec); - auto x = gko::initialize({3.0, -1.0, 2.0}, exec); - auto b = gko::initialize({2.0, 1.0, 5.0}, exec); + using Id = typename TestFixture::Id; + using Vec = typename TestFixture::Vec; + auto identity = Id::create(this->exec, 3); + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({1.0}, this->exec); + auto x = gko::initialize({3.0, -1.0, 2.0}, this->exec); + auto b = gko::initialize({2.0, 1.0, 5.0}, this->exec); identity->apply(alpha.get(), b.get(), beta.get(), x.get()); @@ -68,19 +77,22 @@ TEST_F(Identity, AppliesLinearCombinationToVector) } -TEST_F(Identity, AppliesLinearCombinationToMultipleVectors) +TYPED_TEST(Identity, AppliesLinearCombinationToMultipleVectors) { - auto identity = Id::create(exec, 3); - auto alpha = gko::initialize({2.0}, exec); - auto beta = gko::initialize({1.0}, exec); - auto x = - gko::initialize(3, {{3.0, 0.5}, {-1.0, 2.5}, {2.0, 3.4}}, exec); - auto b = - gko::initialize(3, {{2.0, 3.0}, {1.0, 2.0}, {5.0, -1.0}}, exec); + using Id = typename TestFixture::Id; + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; + auto identity = Id::create(this->exec, 3); + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({1.0}, this->exec); + auto x = 
gko::initialize( + 3, {I{3.0, 0.5}, I{-1.0, 2.5}, I{2.0, 3.5}}, this->exec); + auto b = gko::initialize( + 3, {I{2.0, 3.0}, I{1.0, 2.0}, I{5.0, -1.0}}, this->exec); identity->apply(alpha.get(), b.get(), beta.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({{7.0, 6.5}, {1.0, 6.5}, {12.0, 1.4}}), 0.0); + GKO_ASSERT_MTX_NEAR(x, l({{7.0, 6.5}, {1.0, 6.5}, {12.0, 1.5}}), 0.0); } diff --git a/reference/test/matrix/permutation.cpp b/reference/test/matrix/permutation.cpp new file mode 100644 index 00000000000..1c7e93fe115 --- /dev/null +++ b/reference/test/matrix/permutation.cpp @@ -0,0 +1,499 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include +#include +#include +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +template +class Permutation : public ::testing::Test { +protected: + using v_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using i_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Vec = gko::matrix::Dense; + using Csr = gko::matrix::Csr; + + Permutation() : exec(gko::ReferenceExecutor::create()) {} + + std::shared_ptr exec; +}; + +TYPED_TEST_CASE(Permutation, gko::test::ValueIndexTypes); + + +TYPED_TEST(Permutation, AppliesRowPermutationToDense) +{ + using i_type = typename TestFixture::i_type; + using T = typename TestFixture::v_type; + using Vec = typename TestFixture::Vec; + // clang-format off + auto x = gko::initialize( + {I{2.0, 3.0}, + I{4.0, 2.5}}, this->exec); + // clang-format on + auto y = Vec::create(this->exec, gko::dim<2>{2}); + i_type rdata[] = {1, 0}; + + auto perm = gko::matrix::Permutation::create( + this->exec, gko::dim<2>{2}, + gko::Array::view(this->exec, 2, rdata)); + + perm->apply(x.get(), y.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(y.get(), + l({{4.0, 2.5}, + {2.0, 3.0}}), + 0.0); + // clang-format on +} + + +TYPED_TEST(Permutation, AppliesColPermutationToDense) +{ + using i_type = typename TestFixture::i_type; + using T = typename TestFixture::v_type; + using Vec = 
typename TestFixture::Vec; + // clang-format off + auto x = gko::initialize( + {I{2.0, 3.0}, + I{4.0, 2.5}}, this->exec); + // clang-format on + auto y = Vec::create(this->exec, gko::dim<2>{2}); + i_type rdata[] = {1, 0}; + + auto perm = gko::matrix::Permutation::create( + this->exec, gko::dim<2>{2}, + gko::Array::view(this->exec, 2, rdata), + gko::matrix::column_permute); + + perm->apply(x.get(), y.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(y.get(), + l({{3.0, 2.0}, + {2.5, 4.0}}), + 0.0); + // clang-format on +} + + +TYPED_TEST(Permutation, AppliesRowAndColPermutationToDense) +{ + using i_type = typename TestFixture::i_type; + using T = typename TestFixture::v_type; + using Vec = typename TestFixture::Vec; + // clang-format off + auto x = gko::initialize( + {I{2.0, 3.0}, + I{4.0, 2.5}}, this->exec); + // clang-format on + auto y1 = Vec::create(this->exec, gko::dim<2>{2}); + auto y2 = Vec::create(this->exec, gko::dim<2>{2}); + i_type cdata[] = {1, 0}; + i_type rdata[] = {1, 0}; + + auto rperm = gko::matrix::Permutation::create( + this->exec, gko::dim<2>{2}, + gko::Array::view(this->exec, 2, rdata)); + auto cperm = gko::matrix::Permutation::create( + this->exec, gko::dim<2>{2}, + gko::Array::view(this->exec, 2, cdata), + gko::matrix::column_permute); + + rperm->apply(x.get(), y1.get()); + cperm->apply(y1.get(), y2.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(y2.get(), + l({{2.5, 4.0}, + {3.0, 2.0}}), + 0.0); + // clang-format on +} + + +TYPED_TEST(Permutation, AppliesRowAndColPermutationToDenseWithOneArray) +{ + using i_type = typename TestFixture::i_type; + using T = typename TestFixture::v_type; + using Vec = typename TestFixture::Vec; + // clang-format off + auto x = gko::initialize( + {I{2.0, 3.0}, + I{4.0, 2.5}}, this->exec); + // clang-format on + auto y1 = Vec::create(this->exec, gko::dim<2>{2}); + i_type data[] = {1, 0}; + + auto perm = gko::matrix::Permutation::create( + this->exec, gko::dim<2>{2}, + gko::Array::view(this->exec, 2, data), + 
gko::matrix::row_permute | gko::matrix::column_permute); + + perm->apply(x.get(), y1.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(y1.get(), + l({{2.5, 4.0}, + {3.0, 2.0}}), + 0.0); + // clang-format on +} + + +TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToDense) +{ + using i_type = typename TestFixture::i_type; + using Vec = typename TestFixture::Vec; + // clang-format off + auto x = gko::initialize({{2.0, 3.0, 0.0}, + {0.0, 1.0, 0.0}, + {0.0, 4.0, 2.5}}, + this->exec); + // clang-format on + auto y1 = Vec::create(this->exec, gko::dim<2>{3}); + auto y2 = Vec::create(this->exec, gko::dim<2>{3}); + i_type cdata[] = {1, 2, 0}; + i_type rdata[] = {1, 2, 0}; + + auto rperm = gko::matrix::Permutation::create( + this->exec, gko::dim<2>{3}, + gko::Array::view(this->exec, 3, rdata), + gko::matrix::row_permute | gko::matrix::inverse_permute); + auto cperm = gko::matrix::Permutation::create( + this->exec, gko::dim<2>{3}, + gko::Array::view(this->exec, 3, cdata), + gko::matrix::inverse_permute | gko::matrix::column_permute); + + rperm->apply(x.get(), y1.get()); + cperm->apply(y1.get(), y2.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(y2.get(), + l({{2.5, 0.0, 4.0}, + {0.0, 2.0, 3.0}, + {0.0, 0.0, 1.0}}), + 0.0); + // clang-format on +} + + +TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToDenseWithOneArray) +{ + using i_type = typename TestFixture::i_type; + using Vec = typename TestFixture::Vec; + // clang-format off + auto x = gko::initialize({{2.0, 3.0, 0.0}, + {0.0, 1.0, 0.0}, + {0.0, 4.0, 2.5}}, + this->exec); + // clang-format on + auto y1 = Vec::create(this->exec, gko::dim<2>{3}); + i_type data[] = {1, 2, 0}; + + auto perm = gko::matrix::Permutation::create( + this->exec, gko::dim<2>{3}, + gko::Array::view(this->exec, 3, data), + gko::matrix::column_permute | gko::matrix::row_permute | + gko::matrix::inverse_permute); + + perm->apply(x.get(), y1.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(y1.get(), + l({{2.5, 0.0, 4.0}, + 
{0.0, 2.0, 3.0}, + {0.0, 0.0, 1.0}}), + 0.0); + // clang-format on +} + + +TYPED_TEST(Permutation, AppliesInverseRowPermutationToDense) +{ + using i_type = typename TestFixture::i_type; + using Vec = typename TestFixture::Vec; + // clang-format off + auto x = gko::initialize({{2.0, 3.0, 0.0}, + {0.0, 1.0, 0.0}, + {0.0, 4.0, 2.5}}, + this->exec); + // clang-format on + auto y = Vec::create(this->exec, gko::dim<2>{3}); + i_type rdata[] = {1, 2, 0}; + + auto rperm = gko::matrix::Permutation::create( + this->exec, gko::dim<2>{3}, + gko::Array::view(this->exec, 3, rdata), + gko::matrix::row_permute | gko::matrix::inverse_permute); + + rperm->apply(x.get(), y.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(y.get(), + l({{0.0, 4.0, 2.5}, + {2.0, 3.0, 0.0}, + {0.0, 1.0, 0.0}}), + 0.0); + // clang-format on +} + + +TYPED_TEST(Permutation, AppliesInverseColPermutationToDense) +{ + using i_type = typename TestFixture::i_type; + using Vec = typename TestFixture::Vec; + // clang-format off + auto x = gko::initialize({{2.0, 3.0, 0.0}, + {0.0, 1.0, 0.0}, + {0.0, 4.0, 2.5}}, + this->exec); + // clang-format on + auto y = Vec::create(this->exec, gko::dim<2>{3}); + i_type cdata[] = {1, 2, 0}; + + auto cperm = gko::matrix::Permutation::create( + this->exec, gko::dim<2>{3}, + gko::Array::view(this->exec, 3, cdata), + gko::matrix::inverse_permute | gko::matrix::column_permute); + + cperm->apply(x.get(), y.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(y.get(), + l({{0.0, 2.0, 3.0}, + {0.0, 0.0, 1.0}, + {2.5, 0.0, 4.0}}), + 0.0); + // clang-format on +} + + +TYPED_TEST(Permutation, AppliesRowPermutationToCsr) +{ + using i_type = typename TestFixture::i_type; + using Csr = typename TestFixture::Csr; + // clang-format off + auto x = gko::initialize( + {{2.0, 3.0, 0.0}, + {0.0, 1.0, 0.0}, + {0.0, 4.0, 2.5}}, + this->exec); + // clang-format on + auto y = Csr::create(this->exec, gko::dim<2>{3}); + i_type rdata[] = {1, 2, 0}; + + auto perm = gko::matrix::Permutation::create( + 
this->exec, gko::dim<2>{3}, + gko::Array::view(this->exec, 3, rdata)); + + perm->apply(x.get(), y.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(y.get(), + l({{0.0, 1.0, 0.0}, + {0.0, 4.0, 2.5}, + {2.0, 3.0, 0.0}}), + 0.0); + // clang-format on +} + + +TYPED_TEST(Permutation, AppliesColPermutationToCsr) +{ + using i_type = typename TestFixture::i_type; + using Csr = typename TestFixture::Csr; + // clang-format off + auto x = gko::initialize( + {{2.0, 3.0, 0.0}, + {0.0, 1.0, 0.0}, + {0.0, 4.0, 2.5}}, + this->exec); + // clang-format on + auto y = Csr::create(this->exec, gko::dim<2>{3}); + i_type cdata[] = {1, 2, 0}; + + auto perm = gko::matrix::Permutation::create( + this->exec, gko::dim<2>{3}, + gko::Array::view(this->exec, 3, cdata), + gko::matrix::column_permute); + + perm->apply(x.get(), y.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(y.get(), + l({{3.0, 0.0, 2.0}, + {1.0, 0.0, 0.0}, + {4.0, 2.5, 0.0}}), + 0.0); + // clang-format on +} + + +TYPED_TEST(Permutation, AppliesRowAndColPermutationToCsr) +{ + using i_type = typename TestFixture::i_type; + using Csr = typename TestFixture::Csr; + // clang-format off + auto x = gko::initialize( + {{2.0, 3.0, 0.0}, + {0.0, 1.0, 0.0}, + {0.0, 4.0, 2.5}}, + this->exec); + // clang-format on + auto y1 = Csr::create(this->exec, gko::dim<2>{3}); + auto y2 = Csr::create(this->exec, gko::dim<2>{3}); + i_type cdata[] = {1, 2, 0}; + i_type rdata[] = {1, 2, 0}; + + auto rperm = gko::matrix::Permutation::create( + this->exec, gko::dim<2>{3}, + gko::Array::view(this->exec, 3, rdata)); + auto cperm = gko::matrix::Permutation::create( + this->exec, gko::dim<2>{3}, + gko::Array::view(this->exec, 3, cdata), + gko::matrix::column_permute); + + rperm->apply(x.get(), y1.get()); + cperm->apply(y1.get(), y2.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(y2.get(), + l({{1.0, 0.0, 0.0}, + {4.0, 2.5, 0.0}, + {3.0, 0.0, 2.0}}), + 0.0); + // clang-format on +} + + +TYPED_TEST(Permutation, AppliesInverseRowPermutationToCsr) +{ + 
using i_type = typename TestFixture::i_type; + using Csr = typename TestFixture::Csr; + // clang-format off + auto x = gko::initialize({{2.0, 3.0, 0.0}, + {0.0, 1.0, 0.0}, + {0.0, 4.0, 2.5}}, + this->exec); + // clang-format on + auto y = Csr::create(this->exec, gko::dim<2>{3}); + i_type rdata[] = {1, 2, 0}; + + auto rperm = gko::matrix::Permutation::create( + this->exec, gko::dim<2>{3}, + gko::Array::view(this->exec, 3, rdata), + gko::matrix::row_permute | gko::matrix::inverse_permute); + + rperm->apply(x.get(), y.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(y.get(), + l({{0.0, 4.0, 2.5}, + {2.0, 3.0, 0.0}, + {0.0, 1.0, 0.0}}), + 0.0); + // clang-format on +} + + +TYPED_TEST(Permutation, AppliesInverseColPermutationToCsr) +{ + using i_type = typename TestFixture::i_type; + using Csr = typename TestFixture::Csr; + // clang-format off + auto x = gko::initialize({{2.0, 3.0, 0.0}, + {0.0, 1.0, 0.0}, + {0.0, 4.0, 2.5}}, + this->exec); + // clang-format on + auto y = Csr::create(this->exec, gko::dim<2>{3}); + i_type cdata[] = {1, 2, 0}; + + auto cperm = gko::matrix::Permutation::create( + this->exec, gko::dim<2>{3}, + gko::Array::view(this->exec, 3, cdata), + gko::matrix::inverse_permute | gko::matrix::column_permute); + + cperm->apply(x.get(), y.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(y.get(), + l({{0.0, 2.0, 3.0}, + {0.0, 0.0, 1.0}, + {2.5, 0.0, 4.0}}), + 0.0); + // clang-format on +} + + +TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToCsr) +{ + using i_type = typename TestFixture::i_type; + using Csr = typename TestFixture::Csr; + // clang-format off + auto x = gko::initialize({{2.0, 3.0, 0.0}, + {0.0, 1.0, 0.0}, + {0.0, 4.0, 2.5}}, + this->exec); + // clang-format on + auto y1 = Csr::create(this->exec, gko::dim<2>{3}); + auto y2 = Csr::create(this->exec, gko::dim<2>{3}); + i_type cdata[] = {1, 2, 0}; + i_type rdata[] = {1, 2, 0}; + + auto rperm = gko::matrix::Permutation::create( + this->exec, gko::dim<2>{3}, + 
gko::Array::view(this->exec, 3, rdata), + gko::matrix::row_permute | gko::matrix::inverse_permute); + auto cperm = gko::matrix::Permutation::create( + this->exec, gko::dim<2>{3}, + gko::Array::view(this->exec, 3, cdata), + gko::matrix::inverse_permute | gko::matrix::column_permute); + + rperm->apply(x.get(), y1.get()); + cperm->apply(y1.get(), y2.get()); + // clang-format off + GKO_ASSERT_MTX_NEAR(y2.get(), + l({{2.5, 0.0, 4.0}, + {0.0, 2.0, 3.0}, + {0.0, 0.0, 1.0}}), + 0.0); + // clang-format on +} + + +} // namespace diff --git a/reference/test/matrix/sellp_kernels.cpp b/reference/test/matrix/sellp_kernels.cpp index 102e218dbb4..ba28e1d127c 100644 --- a/reference/test/matrix/sellp_kernels.cpp +++ b/reference/test/matrix/sellp_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -36,7 +36,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include #include #include @@ -45,15 +44,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/matrix/sellp_kernels.hpp" +#include "core/test/utils.hpp" namespace { +template class Sellp : public ::testing::Test { protected: - using Mtx = gko::matrix::Sellp<>; - using Vec = gko::matrix::Dense<>; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Mtx = gko::matrix::Sellp; + using Csr = gko::matrix::Csr; + using Vec = gko::matrix::Dense; Sellp() : exec(gko::ReferenceExecutor::create()), @@ -74,29 +80,34 @@ class Sellp : public ::testing::Test { std::unique_ptr mtx2; }; +TYPED_TEST_CASE(Sellp, gko::test::ValueIndexTypes); -TEST_F(Sellp, AppliesToDenseVector) + +TYPED_TEST(Sellp, AppliesToDenseVector) { - auto x = gko::initialize({2.0, 1.0, 4.0}, exec); - auto y = Vec::create(exec, gko::dim<2>{2, 1}); + using Vec = typename TestFixture::Vec; + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); - mtx1->apply(x.get(), y.get()); + this->mtx1->apply(x.get(), y.get()); GKO_ASSERT_MTX_NEAR(y, l({13.0, 5.0}), 0.0); } -TEST_F(Sellp, AppliesToDenseMatrix) +TYPED_TEST(Sellp, AppliesToDenseMatrix) { + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; // clang-format off auto x = gko::initialize( - {{2.0, 3.0}, - {1.0, -1.5}, - {4.0, 2.5}}, exec); + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); // clang-format on - auto y = Vec::create(exec, gko::dim<2>{2}); + auto y = Vec::create(this->exec, gko::dim<2>{2}); - mtx1->apply(x.get(), y.get()); + this->mtx1->apply(x.get(), y.get()); // clang-format off GKO_ASSERT_MTX_NEAR(y, @@ -106,34 +117,37 @@ TEST_F(Sellp, AppliesToDenseMatrix) } -TEST_F(Sellp, AppliesLinearCombinationToDenseVector) +TYPED_TEST(Sellp, AppliesLinearCombinationToDenseVector) { - auto alpha = gko::initialize({-1.0}, exec); - auto beta = gko::initialize({2.0}, exec); - auto x = gko::initialize({2.0, 1.0, 
4.0}, exec); - auto y = gko::initialize({1.0, 2.0}, exec); + using Vec = typename TestFixture::Vec; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = gko::initialize({1.0, 2.0}, this->exec); - mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); + this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); GKO_ASSERT_MTX_NEAR(y, l({-11.0, -1.0}), 0.0); } -TEST_F(Sellp, AppliesLinearCombinationToDenseMatrix) +TYPED_TEST(Sellp, AppliesLinearCombinationToDenseMatrix) { - auto alpha = gko::initialize({-1.0}, exec); - auto beta = gko::initialize({2.0}, exec); + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); // clang-format off auto x = gko::initialize( - {{2.0, 3.0}, - {1.0, -1.5}, - {4.0, 2.5}}, exec); + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); auto y = gko::initialize( - {{1.0, 0.5}, - {2.0, -1.5}}, exec); + {I{1.0, 0.5}, + I{2.0, -1.5}}, this->exec); // clang-format on - mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); + this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); // clang-format off GKO_ASSERT_MTX_NEAR(y, @@ -143,38 +157,84 @@ TEST_F(Sellp, AppliesLinearCombinationToDenseMatrix) } -TEST_F(Sellp, ApplyFailsOnWrongInnerDimension) +TYPED_TEST(Sellp, ApplyFailsOnWrongInnerDimension) +{ + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{2}); + auto y = Vec::create(this->exec, gko::dim<2>{2}); + + ASSERT_THROW(this->mtx1->apply(x.get(), y.get()), gko::DimensionMismatch); +} + + +TYPED_TEST(Sellp, ApplyFailsOnWrongNumberOfRows) { - auto x = Vec::create(exec, gko::dim<2>{2}); - auto y = Vec::create(exec, gko::dim<2>{2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{3, 2}); + auto y = 
Vec::create(this->exec, gko::dim<2>{3, 2}); - ASSERT_THROW(mtx1->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx1->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Sellp, ApplyFailsOnWrongNumberOfRows) +TYPED_TEST(Sellp, ApplyFailsOnWrongNumberOfCols) { - auto x = Vec::create(exec, gko::dim<2>{3, 2}); - auto y = Vec::create(exec, gko::dim<2>{3, 2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{3}, 2); + auto y = Vec::create(this->exec, gko::dim<2>{2}); - ASSERT_THROW(mtx1->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx1->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Sellp, ApplyFailsOnWrongNumberOfCols) +TYPED_TEST(Sellp, ConvertsToPrecision) { - auto x = Vec::create(exec, gko::dim<2>{3}, 2); - auto y = Vec::create(exec, gko::dim<2>{2}); + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using OtherType = typename gko::next_precision; + using Sellp = typename TestFixture::Mtx; + using OtherSellp = gko::matrix::Sellp; + auto tmp = OtherSellp::create(this->exec); + auto res = Sellp::create(this->exec); + // If OtherType is more precise: 0, otherwise r + auto residual = r::value < r::value + ? 
gko::remove_complex{0} + : gko::remove_complex{r::value}; + + this->mtx1->convert_to(tmp.get()); + tmp->convert_to(res.get()); + + GKO_ASSERT_MTX_NEAR(this->mtx1, res, residual); +} + - ASSERT_THROW(mtx1->apply(x.get(), y.get()), gko::DimensionMismatch); +TYPED_TEST(Sellp, MovesToPrecision) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using OtherType = typename gko::next_precision; + using Sellp = typename TestFixture::Mtx; + using OtherSellp = gko::matrix::Sellp; + auto tmp = OtherSellp::create(this->exec); + auto res = Sellp::create(this->exec); + // If OtherType is more precise: 0, otherwise r + auto residual = r::value < r::value + ? gko::remove_complex{0} + : gko::remove_complex{r::value}; + + this->mtx1->move_to(tmp.get()); + tmp->move_to(res.get()); + + GKO_ASSERT_MTX_NEAR(this->mtx1, res, residual); } -TEST_F(Sellp, ConvertsToDense) +TYPED_TEST(Sellp, ConvertsToDense) { - auto dense_mtx = gko::matrix::Dense<>::create(mtx1->get_executor()); + using Vec = typename TestFixture::Vec; + auto dense_mtx = Vec::create(this->mtx1->get_executor()); - mtx1->convert_to(dense_mtx.get()); + this->mtx1->convert_to(dense_mtx.get()); // clang-format off GKO_ASSERT_MTX_NEAR(dense_mtx, @@ -184,11 +244,12 @@ TEST_F(Sellp, ConvertsToDense) } -TEST_F(Sellp, MovesToDense) +TYPED_TEST(Sellp, MovesToDense) { - auto dense_mtx = gko::matrix::Dense<>::create(mtx1->get_executor()); + using Vec = typename TestFixture::Vec; + auto dense_mtx = Vec::create(this->mtx1->get_executor()); - mtx1->move_to(dense_mtx.get()); + this->mtx1->move_to(dense_mtx.get()); // clang-format off GKO_ASSERT_MTX_NEAR(dense_mtx, @@ -198,17 +259,16 @@ TEST_F(Sellp, MovesToDense) } -TEST_F(Sellp, ConvertsToCsr) +TYPED_TEST(Sellp, ConvertsToCsr) { - auto csr_s_classical = std::make_shared::classical>(); - auto csr_s_merge = std::make_shared::merge_path>(); - auto csr_mtx_c = - gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_classical); - 
auto csr_mtx_m = - gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_merge); + using Csr = typename TestFixture::Csr; + auto csr_s_classical = std::make_shared(); + auto csr_s_merge = std::make_shared(); + auto csr_mtx_c = Csr::create(this->mtx1->get_executor(), csr_s_classical); + auto csr_mtx_m = Csr::create(this->mtx1->get_executor(), csr_s_merge); - mtx1->convert_to(csr_mtx_c.get()); - mtx1->convert_to(csr_mtx_m.get()); + this->mtx1->convert_to(csr_mtx_c.get()); + this->mtx1->convert_to(csr_mtx_m.get()); // clang-format off GKO_ASSERT_MTX_NEAR(csr_mtx_c, @@ -216,22 +276,21 @@ TEST_F(Sellp, ConvertsToCsr) {0.0, 5.0, 0.0}}), 0.0); // clang-format on GKO_ASSERT_MTX_NEAR(csr_mtx_c.get(), csr_mtx_m.get(), 0.0); - ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical); - ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge); + ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical"); + ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path"); } -TEST_F(Sellp, MovesToCsr) +TYPED_TEST(Sellp, MovesToCsr) { - auto csr_s_classical = std::make_shared::classical>(); - auto csr_s_merge = std::make_shared::merge_path>(); - auto csr_mtx_c = - gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_classical); - auto csr_mtx_m = - gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_merge); - auto mtx_clone = mtx1->clone(); - - mtx1->move_to(csr_mtx_c.get()); + using Csr = typename TestFixture::Csr; + auto csr_s_classical = std::make_shared(); + auto csr_s_merge = std::make_shared(); + auto csr_mtx_c = Csr::create(this->mtx1->get_executor(), csr_s_classical); + auto csr_mtx_m = Csr::create(this->mtx1->get_executor(), csr_s_merge); + auto mtx_clone = this->mtx1->clone(); + + this->mtx1->move_to(csr_mtx_c.get()); mtx_clone->move_to(csr_mtx_m.get()); // clang-format off @@ -240,33 +299,138 @@ TEST_F(Sellp, MovesToCsr) {0.0, 5.0, 0.0}}), 0.0); // clang-format on GKO_ASSERT_MTX_NEAR(csr_mtx_c.get(), csr_mtx_m.get(), 0.0); - ASSERT_EQ(csr_mtx_c->get_strategy(), 
csr_s_classical); - ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge); + ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical"); + ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path"); +} + + +TYPED_TEST(Sellp, ConvertsEmptyToPrecision) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using OtherType = typename gko::next_precision; + using Sellp = typename TestFixture::Mtx; + using OtherSellp = gko::matrix::Sellp; + auto empty = OtherSellp::create(this->exec); + empty->get_slice_sets()[0] = 0; + auto res = Sellp::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_slice_sets(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Sellp, MovesEmptyToPrecision) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using OtherType = typename gko::next_precision; + using Sellp = typename TestFixture::Mtx; + using OtherSellp = gko::matrix::Sellp; + auto empty = OtherSellp::create(this->exec); + empty->get_slice_sets()[0] = 0; + auto res = Sellp::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_slice_sets(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Sellp, ConvertsEmptyToDense) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Sellp = typename TestFixture::Mtx; + using Dense = gko::matrix::Dense; + auto empty = Sellp::create(this->exec); + auto res = Dense::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Sellp, MovesEmptyToDense) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Sellp = typename TestFixture::Mtx; + using Dense = gko::matrix::Dense; + auto empty = 
Sellp::create(this->exec); + auto res = Dense::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Sellp, ConvertsEmptyToCsr) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Sellp = typename TestFixture::Mtx; + using Csr = gko::matrix::Csr; + auto empty = Sellp::create(this->exec); + auto res = Csr::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_row_ptrs(), 0); + ASSERT_FALSE(res->get_size()); +} + + +TYPED_TEST(Sellp, MovesEmptyToCsr) +{ + using ValueType = typename TestFixture::value_type; + using IndexType = typename TestFixture::index_type; + using Sellp = typename TestFixture::Mtx; + using Csr = gko::matrix::Csr; + auto empty = Sellp::create(this->exec); + auto res = Csr::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_row_ptrs(), 0); + ASSERT_FALSE(res->get_size()); } -TEST_F(Sellp, AppliesWithSliceSizeAndStrideFactorToDenseVector) +TYPED_TEST(Sellp, AppliesWithSliceSizeAndStrideFactorToDenseVector) { - auto x = gko::initialize({2.0, 1.0, 4.0}, exec); - auto y = Vec::create(exec, gko::dim<2>{2, 1}); + using Vec = typename TestFixture::Vec; + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); - mtx2->apply(x.get(), y.get()); + this->mtx2->apply(x.get(), y.get()); GKO_ASSERT_MTX_NEAR(y, l({13.0, 5.0}), 0.0); } -TEST_F(Sellp, AppliesWithSliceSizeAndStrideFactorToDenseMatrix) +TYPED_TEST(Sellp, AppliesWithSliceSizeAndStrideFactorToDenseMatrix) { + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; // clang-format off auto x = gko::initialize( - {{2.0, 3.0}, - {1.0, -1.5}, - {4.0, 2.5}}, exec); + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); // clang-format on - auto y = 
Vec::create(exec, gko::dim<2>{2}); + auto y = Vec::create(this->exec, gko::dim<2>{2}); - mtx2->apply(x.get(), y.get()); + this->mtx2->apply(x.get(), y.get()); // clang-format off GKO_ASSERT_MTX_NEAR(y, @@ -276,34 +440,39 @@ TEST_F(Sellp, AppliesWithSliceSizeAndStrideFactorToDenseMatrix) } -TEST_F(Sellp, AppliesWithSliceSizeAndStrideFactorLinearCombinationToDenseVector) +TYPED_TEST(Sellp, + AppliesWithSliceSizeAndStrideFactorLinearCombinationToDenseVector) { - auto alpha = gko::initialize({-1.0}, exec); - auto beta = gko::initialize({2.0}, exec); - auto x = gko::initialize({2.0, 1.0, 4.0}, exec); - auto y = gko::initialize({1.0, 2.0}, exec); + using Vec = typename TestFixture::Vec; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = gko::initialize({1.0, 2.0}, this->exec); - mtx2->apply(alpha.get(), x.get(), beta.get(), y.get()); + this->mtx2->apply(alpha.get(), x.get(), beta.get(), y.get()); GKO_ASSERT_MTX_NEAR(y, l({-11.0, -1.0}), 0.0); } -TEST_F(Sellp, AppliesWithSliceSizeAndStrideFactorLinearCombinationToDenseMatrix) +TYPED_TEST(Sellp, + AppliesWithSliceSizeAndStrideFactorLinearCombinationToDenseMatrix) { - auto alpha = gko::initialize({-1.0}, exec); - auto beta = gko::initialize({2.0}, exec); + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); // clang-format off auto x = gko::initialize( - {{2.0, 3.0}, - {1.0, -1.5}, - {4.0, 2.5}}, exec); + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); auto y = gko::initialize( - {{1.0, 0.5}, - {2.0, -1.5}}, exec); + {I{1.0, 0.5}, + I{2.0, -1.5}}, this->exec); // clang-format on - mtx2->apply(alpha.get(), x.get(), beta.get(), y.get()); + this->mtx2->apply(alpha.get(), x.get(), beta.get(), y.get()); // clang-format off GKO_ASSERT_MTX_NEAR(y, @@ -313,43 +482,47 @@ 
TEST_F(Sellp, AppliesWithSliceSizeAndStrideFactorLinearCombinationToDenseMatrix) } -TEST_F(Sellp, ApplyWithSliceSizeAndStrideFactorFailsOnWrongInnerDimension) +TYPED_TEST(Sellp, ApplyWithSliceSizeAndStrideFactorFailsOnWrongInnerDimension) { - auto x = Vec::create(exec, gko::dim<2>{2}); - auto y = Vec::create(exec, gko::dim<2>{2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{2}); + auto y = Vec::create(this->exec, gko::dim<2>{2}); - ASSERT_THROW(mtx2->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx2->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Sellp, ApplyWithSliceSizeAndStrideFactorFailsOnWrongNumberOfRows) +TYPED_TEST(Sellp, ApplyWithSliceSizeAndStrideFactorFailsOnWrongNumberOfRows) { - auto x = Vec::create(exec, gko::dim<2>{3, 2}); - auto y = Vec::create(exec, gko::dim<2>{3, 2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{3, 2}); + auto y = Vec::create(this->exec, gko::dim<2>{3, 2}); - ASSERT_THROW(mtx2->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx2->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Sellp, ApplyWithSliceSizeAndStrideFactorFailsOnWrongNumberOfCols) +TYPED_TEST(Sellp, ApplyWithSliceSizeAndStrideFactorFailsOnWrongNumberOfCols) { - auto x = Vec::create(exec, gko::dim<2>{3}, 2); - auto y = Vec::create(exec, gko::dim<2>{2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{3}, 2); + auto y = Vec::create(this->exec, gko::dim<2>{2}); - ASSERT_THROW(mtx2->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx2->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(Sellp, ConvertsWithSliceSizeAndStrideFactorToDense) +TYPED_TEST(Sellp, ConvertsWithSliceSizeAndStrideFactorToDense) { - auto dense_mtx = gko::matrix::Dense<>::create(mtx2->get_executor()); + using Vec = typename TestFixture::Vec; + auto dense_mtx = 
Vec::create(this->mtx2->get_executor()); // clang-format off - auto dense_other = gko::initialize>( + auto dense_other = gko::initialize( 4, {{1.0, 3.0, 2.0}, - {0.0, 5.0, 0.0}}, exec); + {0.0, 5.0, 0.0}}, this->exec); // clang-format on - mtx2->convert_to(dense_mtx.get()); + this->mtx2->convert_to(dense_mtx.get()); // clang-format off GKO_ASSERT_MTX_NEAR(dense_mtx, @@ -359,11 +532,12 @@ TEST_F(Sellp, ConvertsWithSliceSizeAndStrideFactorToDense) } -TEST_F(Sellp, MovesWithSliceSizeAndStrideFactorToDense) +TYPED_TEST(Sellp, MovesWithSliceSizeAndStrideFactorToDense) { - auto dense_mtx = gko::matrix::Dense<>::create(mtx2->get_executor()); + using Vec = typename TestFixture::Vec; + auto dense_mtx = Vec::create(this->mtx2->get_executor()); - mtx2->move_to(dense_mtx.get()); + this->mtx2->move_to(dense_mtx.get()); // clang-format off GKO_ASSERT_MTX_NEAR(dense_mtx, @@ -373,11 +547,12 @@ TEST_F(Sellp, MovesWithSliceSizeAndStrideFactorToDense) } -TEST_F(Sellp, ConvertsWithSliceSizeAndStrideFactorToCsr) +TYPED_TEST(Sellp, ConvertsWithSliceSizeAndStrideFactorToCsr) { - auto csr_mtx = gko::matrix::Csr<>::create(mtx2->get_executor()); + using Csr = typename TestFixture::Csr; + auto csr_mtx = Csr::create(this->mtx2->get_executor()); - mtx2->convert_to(csr_mtx.get()); + this->mtx2->convert_to(csr_mtx.get()); // clang-format off GKO_ASSERT_MTX_NEAR(csr_mtx, @@ -387,11 +562,12 @@ TEST_F(Sellp, ConvertsWithSliceSizeAndStrideFactorToCsr) } -TEST_F(Sellp, MovesWithSliceSizeAndStrideFactorToCsr) +TYPED_TEST(Sellp, MovesWithSliceSizeAndStrideFactorToCsr) { - auto csr_mtx = gko::matrix::Csr<>::create(mtx2->get_executor()); + using Csr = typename TestFixture::Csr; + auto csr_mtx = Csr::create(this->mtx2->get_executor()); - mtx2->move_to(csr_mtx.get()); + this->mtx2->move_to(csr_mtx.get()); // clang-format off GKO_ASSERT_MTX_NEAR(csr_mtx, @@ -401,11 +577,12 @@ TEST_F(Sellp, MovesWithSliceSizeAndStrideFactorToCsr) } -TEST_F(Sellp, CountsNonzeros) +TYPED_TEST(Sellp, CountsNonzeros) { 
gko::size_type nonzeros; - gko::kernels::reference::sellp::count_nonzeros(exec, mtx1.get(), &nonzeros); + gko::kernels::reference::sellp::count_nonzeros(this->exec, this->mtx1.get(), + &nonzeros); ASSERT_EQ(nonzeros, 4); } diff --git a/reference/test/matrix/sparsity_csr.cpp b/reference/test/matrix/sparsity_csr.cpp index f20e8ba3134..69c8f580ffd 100644 --- a/reference/test/matrix/sparsity_csr.cpp +++ b/reference/test/matrix/sparsity_csr.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -43,16 +43,19 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include "core/test/utils/assertions.hpp" +#include "core/test/utils.hpp" namespace { +template class SparsityCsr : public ::testing::Test { protected: - using v_type = double; - using i_type = int; + using v_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using i_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; using Mtx = gko::matrix::SparsityCsr; using Csr = gko::matrix::Csr; using DenseMtx = gko::matrix::Dense; @@ -61,8 +64,8 @@ class SparsityCsr : public ::testing::Test { : exec(gko::ReferenceExecutor::create()), mtx(Mtx::create(exec, gko::dim<2>{2, 3}, 4)) { - Mtx::index_type *c = mtx->get_col_idxs(); - Mtx::index_type *r = mtx->get_row_ptrs(); + i_type *c = mtx->get_col_idxs(); + i_type *r = mtx->get_row_ptrs(); r[0] = 0; r[1] = 3; r[2] = 4; @@ -76,28 +79,35 @@ class SparsityCsr : public ::testing::Test { std::unique_ptr mtx; }; +TYPED_TEST_CASE(SparsityCsr, gko::test::ValueIndexTypes); -TEST_F(SparsityCsr, CanBeCreatedFromExistingCsrMatrix) + +TYPED_TEST(SparsityCsr, CanBeCreatedFromExistingCsrMatrix) { + using Csr = typename TestFixture::Csr; + using DenseMtx = typename TestFixture::DenseMtx; + using Mtx = typename 
TestFixture::Mtx; auto csr_mtx = gko::initialize( - {{2.0, 3.0, 0.0}, {0.0, 1.0, 1.0}, {0.0, 0.0, -3.0}}, exec); + {{2.0, 3.0, 0.0}, {0.0, 1.0, 1.0}, {0.0, 0.0, -3.0}}, this->exec); auto comp_mtx = gko::initialize( - {{1.0, 1.0, 0.0}, {0.0, 1.0, 1.0}, {0.0, 0.0, 1.0}}, exec); + {{1.0, 1.0, 0.0}, {0.0, 1.0, 1.0}, {0.0, 0.0, 1.0}}, this->exec); - auto mtx = Mtx::create(exec, std::move(csr_mtx)); + auto mtx = Mtx::create(this->exec, std::move(csr_mtx)); GKO_ASSERT_MTX_NEAR(comp_mtx.get(), mtx.get(), 0.0); } -TEST_F(SparsityCsr, CanBeCreatedFromExistingDenseMatrix) +TYPED_TEST(SparsityCsr, CanBeCreatedFromExistingDenseMatrix) { + using DenseMtx = typename TestFixture::DenseMtx; + using Mtx = typename TestFixture::Mtx; auto dense_mtx = gko::initialize( - {{2.0, 3.0, 0.0}, {0.0, 1.0, 1.0}, {0.0, 0.0, -3.0}}, exec); + {{2.0, 3.0, 0.0}, {0.0, 1.0, 1.0}, {0.0, 0.0, -3.0}}, this->exec); auto comp_mtx = gko::initialize( - {{1.0, 1.0, 0.0}, {0.0, 1.0, 1.0}, {0.0, 0.0, 1.0}}, exec); + {{1.0, 1.0, 0.0}, {0.0, 1.0, 1.0}, {0.0, 0.0, 1.0}}, this->exec); - auto mtx = Mtx::create(exec, std::move(dense_mtx)); + auto mtx = Mtx::create(this->exec, std::move(dense_mtx)); GKO_ASSERT_MTX_NEAR(comp_mtx.get(), mtx.get(), 0.0); } diff --git a/reference/test/matrix/sparsity_csr_kernels.cpp b/reference/test/matrix/sparsity_csr_kernels.cpp index e63bca5453e..7c40ceb41bc 100644 --- a/reference/test/matrix/sparsity_csr_kernels.cpp +++ b/reference/test/matrix/sparsity_csr_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ -#include "core/matrix/sparsity_csr_kernels.hpp" +#include #include @@ -43,19 +43,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include -#include "core/test/utils/assertions.hpp" +#include "core/matrix/sparsity_csr_kernels.hpp" +#include "core/test/utils.hpp" namespace { +template class SparsityCsr : public ::testing::Test { protected: - using Mtx = gko::matrix::SparsityCsr<>; - using Vec = gko::matrix::Dense<>; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Csr = gko::matrix::Csr; + using Mtx = gko::matrix::SparsityCsr; + using Vec = gko::matrix::Dense; SparsityCsr() : exec(gko::ReferenceExecutor::create()), @@ -71,8 +77,8 @@ class SparsityCsr : public ::testing::Test { void create_mtx(Mtx *m) { - Mtx::index_type *c = m->get_col_idxs(); - Mtx::index_type *r = m->get_row_ptrs(); + index_type *c = m->get_col_idxs(); + index_type *r = m->get_row_ptrs(); /* * 1 1 1 * 0 1 0 @@ -88,8 +94,8 @@ class SparsityCsr : public ::testing::Test { void create_mtx2(Mtx *m) { - Mtx::index_type *c = m->get_col_idxs(); - Mtx::index_type *r = m->get_row_ptrs(); + index_type *c = m->get_col_idxs(); + index_type *r = m->get_row_ptrs(); // It keeps an explict zero /* * 1 1 1 @@ -151,102 +157,118 @@ class SparsityCsr : public ::testing::Test { std::unique_ptr mtx3_unsorted; }; +TYPED_TEST_CASE(SparsityCsr, gko::test::ValueIndexTypes); + -TEST_F(SparsityCsr, AppliesToDenseVector) +TYPED_TEST(SparsityCsr, AppliesToDenseVector) { - auto x = gko::initialize({2.0, 1.0, 4.0}, exec); - auto y = Vec::create(exec, gko::dim<2>{2, 1}); + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); - 
mtx->apply(x.get(), y.get()); + this->mtx->apply(x.get(), y.get()); - EXPECT_EQ(y->at(0), 7.0); - EXPECT_EQ(y->at(1), 1.0); + EXPECT_EQ(y->at(0), T{7.0}); + EXPECT_EQ(y->at(1), T{1.0}); } -TEST_F(SparsityCsr, AppliesToDenseMatrix) +TYPED_TEST(SparsityCsr, AppliesToDenseMatrix) { - auto x = gko::initialize({{2.0, 3.0}, {1.0, -1.5}, {4.0, 2.5}}, exec); - auto y = Vec::create(exec, gko::dim<2>{2}); - - mtx->apply(x.get(), y.get()); - - EXPECT_EQ(y->at(0, 0), 7.0); - EXPECT_EQ(y->at(1, 0), 1.0); - EXPECT_EQ(y->at(0, 1), 4.0); - EXPECT_EQ(y->at(1, 1), -1.5); + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; + auto x = gko::initialize( + {I{2.0, 3.0}, I{1.0, -1.5}, I{4.0, 2.5}}, this->exec); + auto y = Vec::create(this->exec, gko::dim<2>{2}); + + this->mtx->apply(x.get(), y.get()); + + EXPECT_EQ(y->at(0, 0), T{7.0}); + EXPECT_EQ(y->at(1, 0), T{1.0}); + EXPECT_EQ(y->at(0, 1), T{4.0}); + EXPECT_EQ(y->at(1, 1), T{-1.5}); } -TEST_F(SparsityCsr, AppliesLinearCombinationToDenseVector) +TYPED_TEST(SparsityCsr, AppliesLinearCombinationToDenseVector) { - auto alpha = gko::initialize({-1.0}, exec); - auto beta = gko::initialize({2.0}, exec); - auto x = gko::initialize({2.0, 1.0, 4.0}, exec); - auto y = gko::initialize({1.0, 2.0}, exec); + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = gko::initialize({1.0, 2.0}, this->exec); - mtx->apply(alpha.get(), x.get(), beta.get(), y.get()); + this->mtx->apply(alpha.get(), x.get(), beta.get(), y.get()); - EXPECT_EQ(y->at(0), -5.0); - EXPECT_EQ(y->at(1), 3.0); + EXPECT_EQ(y->at(0), T{-5.0}); + EXPECT_EQ(y->at(1), T{3.0}); } -TEST_F(SparsityCsr, AppliesLinearCombinationToDenseMatrix) +TYPED_TEST(SparsityCsr, AppliesLinearCombinationToDenseMatrix) { - auto alpha = gko::initialize({-1.0}, exec); - auto 
beta = gko::initialize({2.0}, exec); - auto x = gko::initialize({{2.0, 3.0}, {1.0, -1.5}, {4.0, 2.5}}, exec); - auto y = gko::initialize({{1.0, 0.5}, {2.0, -1.5}}, exec); - - mtx->apply(alpha.get(), x.get(), beta.get(), y.get()); - - EXPECT_EQ(y->at(0, 0), -5.0); - EXPECT_EQ(y->at(1, 0), 3.0); - EXPECT_EQ(y->at(0, 1), -3.0); - EXPECT_EQ(y->at(1, 1), -1.5); + using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + auto x = gko::initialize( + {I{2.0, 3.0}, I{1.0, -1.5}, I{4.0, 2.5}}, this->exec); + auto y = + gko::initialize({I{1.0, 0.5}, I{2.0, -1.5}}, this->exec); + + this->mtx->apply(alpha.get(), x.get(), beta.get(), y.get()); + + EXPECT_EQ(y->at(0, 0), T{-5.0}); + EXPECT_EQ(y->at(1, 0), T{3.0}); + EXPECT_EQ(y->at(0, 1), T{-3.0}); + EXPECT_EQ(y->at(1, 1), T{-1.5}); } -TEST_F(SparsityCsr, ApplyFailsOnWrongInnerDimension) +TYPED_TEST(SparsityCsr, ApplyFailsOnWrongInnerDimension) { - auto x = Vec::create(exec, gko::dim<2>{2}); - auto y = Vec::create(exec, gko::dim<2>{2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{2}); + auto y = Vec::create(this->exec, gko::dim<2>{2}); - ASSERT_THROW(mtx->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(SparsityCsr, ApplyFailsOnWrongNumberOfRows) +TYPED_TEST(SparsityCsr, ApplyFailsOnWrongNumberOfRows) { - auto x = Vec::create(exec, gko::dim<2>{3, 2}); - auto y = Vec::create(exec, gko::dim<2>{3, 2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{3, 2}); + auto y = Vec::create(this->exec, gko::dim<2>{3, 2}); - ASSERT_THROW(mtx->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(SparsityCsr, ApplyFailsOnWrongNumberOfCols) +TYPED_TEST(SparsityCsr, 
ApplyFailsOnWrongNumberOfCols) { - auto x = Vec::create(exec, gko::dim<2>{3}); - auto y = Vec::create(exec, gko::dim<2>{2}); + using Vec = typename TestFixture::Vec; + auto x = Vec::create(this->exec, gko::dim<2>{3}); + auto y = Vec::create(this->exec, gko::dim<2>{2}); - ASSERT_THROW(mtx->apply(x.get(), y.get()), gko::DimensionMismatch); + ASSERT_THROW(this->mtx->apply(x.get(), y.get()), gko::DimensionMismatch); } -TEST_F(SparsityCsr, SquareMtxIsTransposable) +TYPED_TEST(SparsityCsr, SquareMtxIsTransposable) { + using Mtx = typename TestFixture::Mtx; // clang-format off - auto mtx2 = gko::initialize>( + auto mtx2 = gko::initialize( {{1.0, 1.0, 1.0}, {0.0, 1.0, 0.0}, - {0.0, 1.0, 1.0}}, exec); + {0.0, 1.0, 1.0}}, this->exec); // clang-format on auto trans = mtx2->transpose(); - auto trans_as_sparsity = - static_cast *>(trans.get()); + auto trans_as_sparsity = static_cast(trans.get()); // clang-format off GKO_ASSERT_MTX_NEAR(trans_as_sparsity, @@ -257,11 +279,11 @@ TEST_F(SparsityCsr, SquareMtxIsTransposable) } -TEST_F(SparsityCsr, NonSquareMtxIsTransposable) +TYPED_TEST(SparsityCsr, NonSquareMtxIsTransposable) { - auto trans = mtx->transpose(); - auto trans_as_sparsity = - static_cast *>(trans.get()); + using Mtx = typename TestFixture::Mtx; + auto trans = this->mtx->transpose(); + auto trans_as_sparsity = static_cast(trans.get()); // clang-format off GKO_ASSERT_MTX_NEAR(trans_as_sparsity, @@ -272,90 +294,86 @@ TEST_F(SparsityCsr, NonSquareMtxIsTransposable) } -TEST_F(SparsityCsr, CountsCorrectNumberOfDiagonalElements) +TYPED_TEST(SparsityCsr, CountsCorrectNumberOfDiagonalElements) { + using Mtx = typename TestFixture::Mtx; // clang-format off - auto mtx2 = gko::initialize>( - {{1.0, 1.0, 1.0}, - {0.0, 1.0, 0.0}, - {0.0, 1.0, 1.0}}, exec); - auto mtx_s = gko::initialize>( - {{1.0, 1.0, 1.0}, - {0.0, 0.0, 0.0}, - {0.0, 1.0, 1.0}}, exec); + auto mtx2 = gko::initialize({{1.0, 1.0, 1.0}, + {0.0, 1.0, 0.0}, + {0.0, 1.0, 1.0}}, this->exec); + auto mtx_s = 
gko::initialize({{1.0, 1.0, 1.0}, + {0.0, 0.0, 0.0}, + {0.0, 1.0, 1.0}}, this->exec); // clang-format on gko::size_type m2_num_diags = 0; gko::size_type ms_num_diags = 0; gko::kernels::reference::sparsity_csr::count_num_diagonal_elements( - exec, mtx2.get(), &m2_num_diags); + this->exec, mtx2.get(), &m2_num_diags); gko::kernels::reference::sparsity_csr::count_num_diagonal_elements( - exec, mtx_s.get(), &ms_num_diags); + this->exec, mtx_s.get(), &ms_num_diags); ASSERT_EQ(m2_num_diags, 3); ASSERT_EQ(ms_num_diags, 2); } -TEST_F(SparsityCsr, RemovesDiagonalElementsForFullRankMatrix) +TYPED_TEST(SparsityCsr, RemovesDiagonalElementsForFullRankMatrix) { + using Mtx = typename TestFixture::Mtx; // clang-format off - auto mtx2 = gko::initialize>( - {{1.0, 1.0, 1.0}, - {0.0, 1.0, 0.0}, - {0.0, 1.0, 1.0}}, exec); - auto mtx_s = gko::initialize>( - {{0.0, 1.0, 1.0}, - {0.0, 0.0, 0.0}, - {0.0, 1.0, 0.0}}, exec); + auto mtx2 = gko::initialize({{1.0, 1.0, 1.0}, + {0.0, 1.0, 0.0}, + {0.0, 1.0, 1.0}}, this->exec); + auto mtx_s = gko::initialize({{0.0, 1.0, 1.0}, + {0.0, 0.0, 0.0}, + {0.0, 1.0, 0.0}}, this->exec); // clang-format on - auto tmp_mtx = gko::matrix::SparsityCsr<>::create( - exec, mtx_s->get_size(), mtx_s->get_num_nonzeros()); + auto tmp_mtx = + Mtx::create(this->exec, mtx_s->get_size(), mtx_s->get_num_nonzeros()); tmp_mtx->copy_from(mtx2.get()); gko::kernels::reference::sparsity_csr::remove_diagonal_elements( - exec, tmp_mtx.get(), mtx2->get_const_row_ptrs(), - mtx2->get_const_col_idxs()); + this->exec, mtx2->get_const_row_ptrs(), mtx2->get_const_col_idxs(), + tmp_mtx.get()); GKO_ASSERT_MTX_NEAR(tmp_mtx.get(), mtx_s.get(), 0.0); } -TEST_F(SparsityCsr, RemovesDiagonalElementsForIncompleteRankMatrix) +TYPED_TEST(SparsityCsr, RemovesDiagonalElementsForIncompleteRankMatrix) { + using Mtx = typename TestFixture::Mtx; // clang-format off - auto mtx2 = gko::initialize>( - {{1.0, 1.0, 1.0}, - {0.0, 0.0, 0.0}, - {0.0, 1.0, 1.0}}, exec); - auto mtx_s = gko::initialize>( - {{0.0, 
1.0, 1.0}, - {0.0, 0.0, 0.0}, - {0.0, 1.0, 0.0}}, exec); + auto mtx2 = gko::initialize({{1.0, 1.0, 1.0}, + {0.0, 0.0, 0.0}, + {0.0, 1.0, 1.0}}, this->exec); + auto mtx_s = gko::initialize({{0.0, 1.0, 1.0}, + {0.0, 0.0, 0.0}, + {0.0, 1.0, 0.0}}, this->exec); // clang-format on - auto tmp_mtx = gko::matrix::SparsityCsr<>::create( - exec, mtx_s->get_size(), mtx_s->get_num_nonzeros()); + auto tmp_mtx = + Mtx::create(this->exec, mtx_s->get_size(), mtx_s->get_num_nonzeros()); tmp_mtx->copy_from(mtx2.get()); gko::kernels::reference::sparsity_csr::remove_diagonal_elements( - exec, tmp_mtx.get(), mtx2->get_const_row_ptrs(), - mtx2->get_const_col_idxs()); + this->exec, mtx2->get_const_row_ptrs(), mtx2->get_const_col_idxs(), + tmp_mtx.get()); GKO_ASSERT_MTX_NEAR(tmp_mtx.get(), mtx_s.get(), 0.0); } -TEST_F(SparsityCsr, SquareMtxIsConvertibleToAdjacencyMatrix) +TYPED_TEST(SparsityCsr, SquareMtxIsConvertibleToAdjacencyMatrix) { + using Mtx = typename TestFixture::Mtx; // clang-format off - auto mtx2 = gko::initialize>( - {{1.0, 1.0, 1.0}, - {0.0, 1.0, 0.0}, - {0.0, 1.0, 1.0}}, exec); - auto mtx_s = gko::initialize>( - {{0.0, 1.0, 1.0}, - {0.0, 0.0, 0.0}, - {0.0, 1.0, 0.0}}, exec); + auto mtx2 = gko::initialize({{1.0, 1.0, 1.0}, + {0.0, 1.0, 0.0}, + {0.0, 1.0, 1.0}}, this->exec); + auto mtx_s = gko::initialize({{0.0, 1.0, 1.0}, + {0.0, 0.0, 0.0}, + {0.0, 1.0, 0.0}}, this->exec); // clang-format on auto adj_mat = mtx2->to_adjacency_matrix(); @@ -364,43 +382,43 @@ TEST_F(SparsityCsr, SquareMtxIsConvertibleToAdjacencyMatrix) } -TEST_F(SparsityCsr, NonSquareMtxIsNotConvertibleToAdjacencyMatrix) +TYPED_TEST(SparsityCsr, NonSquareMtxIsNotConvertibleToAdjacencyMatrix) { - ASSERT_THROW(mtx->to_adjacency_matrix(), gko::DimensionMismatch); + ASSERT_THROW(this->mtx->to_adjacency_matrix(), gko::DimensionMismatch); } -TEST_F(SparsityCsr, RecognizeSortedMatrix) +TYPED_TEST(SparsityCsr, RecognizeSortedMatrix) { - ASSERT_TRUE(mtx->is_sorted_by_column_index()); - 
ASSERT_TRUE(mtx2->is_sorted_by_column_index()); - ASSERT_TRUE(mtx3_sorted->is_sorted_by_column_index()); + ASSERT_TRUE(this->mtx->is_sorted_by_column_index()); + ASSERT_TRUE(this->mtx2->is_sorted_by_column_index()); + ASSERT_TRUE(this->mtx3_sorted->is_sorted_by_column_index()); } -TEST_F(SparsityCsr, RecognizeUnsortedMatrix) +TYPED_TEST(SparsityCsr, RecognizeUnsortedMatrix) { - ASSERT_FALSE(mtx3_unsorted->is_sorted_by_column_index()); + ASSERT_FALSE(this->mtx3_unsorted->is_sorted_by_column_index()); } -TEST_F(SparsityCsr, SortSortedMatrix) +TYPED_TEST(SparsityCsr, SortSortedMatrix) { - auto matrix = mtx3_sorted->clone(); + auto matrix = this->mtx3_sorted->clone(); matrix->sort_by_column_index(); - GKO_ASSERT_MTX_NEAR(matrix, mtx3_sorted, 0.0); + GKO_ASSERT_MTX_NEAR(matrix, this->mtx3_sorted, 0.0); } -TEST_F(SparsityCsr, SortUnsortedMatrix) +TYPED_TEST(SparsityCsr, SortUnsortedMatrix) { - auto matrix = mtx3_unsorted->clone(); + auto matrix = this->mtx3_unsorted->clone(); matrix->sort_by_column_index(); - GKO_ASSERT_MTX_NEAR(matrix, mtx3_sorted, 0.0); + GKO_ASSERT_MTX_NEAR(matrix, this->mtx3_sorted, 0.0); } diff --git a/reference/test/preconditioner/CMakeLists.txt b/reference/test/preconditioner/CMakeLists.txt index 89232e6e55b..908ac83533e 100644 --- a/reference/test/preconditioner/CMakeLists.txt +++ b/reference/test/preconditioner/CMakeLists.txt @@ -1,3 +1,4 @@ ginkgo_create_test(ilu) +ginkgo_create_test(isai_kernels) ginkgo_create_test(jacobi) ginkgo_create_test(jacobi_kernels) diff --git a/reference/test/preconditioner/ilu.cpp b/reference/test/preconditioner/ilu.cpp index 2bea8d75c9c..b44791098c1 100644 --- a/reference/test/preconditioner/ilu.cpp +++ b/reference/test/preconditioner/ilu.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -47,20 +47,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include +#include #include -#include "core/test/utils/assertions.hpp" +#include "core/test/utils.hpp" namespace { +template class Ilu : public ::testing::Test { protected: - using value_type = gko::default_precision; - using index_type = gko::int32; + using value_type = T; using Mtx = gko::matrix::Dense; using l_solver_type = gko::solver::Bicgstab; using u_solver_type = gko::solver::Bicgstab; @@ -69,16 +69,17 @@ class Ilu : public ::testing::Test { gko::preconditioner::Ilu; using ilu_rev_prec_type = gko::preconditioner::Ilu; - using composition = gko::Composition; + using Composition = gko::Composition; Ilu() : exec(gko::ReferenceExecutor::create()), - mtx(gko::initialize({{2, 1, 1}, {2, 5, 2}, {2, 5, 5}}, exec)), - l_factor( - gko::initialize({{1, 0, 0}, {1, 1, 0}, {1, 1, 1}}, exec)), - u_factor( - gko::initialize({{2, 1, 1}, {0, 4, 1}, {0, 0, 3}}, exec)), - l_u_composition(composition::create(l_factor, u_factor)), + mtx(gko::initialize({{2., 1., 1.}, {2., 5., 2.}, {2., 5., 5.}}, + exec)), + l_factor(gko::initialize( + {{1., 0., 0.}, {1., 1., 0.}, {1., 1., 1.}}, exec)), + u_factor(gko::initialize( + {{2., 1., 1.}, {0., 4., 1.}, {0., 0., 3.}}, exec)), + l_u_composition(Composition::create(l_factor, u_factor)), l_factory( l_solver_type::build() .with_criteria( @@ -87,8 +88,8 @@ class Ilu : public ::testing::Test { gko::stop::Time::build() .with_time_limit(std::chrono::seconds(6)) .on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-15) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) .on(exec)) .on(exec)), u_factory( @@ -99,8 +100,8 @@ class Ilu : public ::testing::Test { gko::stop::Time::build() .with_time_limit(std::chrono::seconds(6)) .on(exec), - gko::stop::ResidualNormReduction<>::build() - 
.with_reduction_factor(1e-15) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) .on(exec)) .on(exec)), ilu_pre_factory(ilu_prec_type::build() @@ -117,79 +118,92 @@ class Ilu : public ::testing::Test { std::shared_ptr mtx; std::shared_ptr l_factor; std::shared_ptr u_factor; - std::shared_ptr l_u_composition; - std::shared_ptr l_factory; - std::shared_ptr u_factory; - std::shared_ptr ilu_pre_factory; - std::shared_ptr ilu_rev_pre_factory; + std::shared_ptr l_u_composition; + std::shared_ptr l_factory; + std::shared_ptr u_factory; + std::shared_ptr ilu_pre_factory; + std::shared_ptr ilu_rev_pre_factory; }; +TYPED_TEST_CASE(Ilu, gko::test::ValueTypes); -TEST_F(Ilu, BuildsDefaultWithoutThrowing) + +TYPED_TEST(Ilu, BuildsDefaultWithoutThrowing) { - auto ilu_pre_default_factory = ilu_prec_type::build().on(exec); + using ilu_prec_type = typename TestFixture::ilu_prec_type; + auto ilu_pre_default_factory = ilu_prec_type::build().on(this->exec); - ASSERT_NO_THROW(ilu_pre_default_factory->generate(l_u_composition)); + ASSERT_NO_THROW(ilu_pre_default_factory->generate(this->l_u_composition)); } -TEST_F(Ilu, BuildsCustomWithoutThrowing) +TYPED_TEST(Ilu, BuildsCustomWithoutThrowing) { - ASSERT_NO_THROW(ilu_pre_factory->generate(l_u_composition)); + ASSERT_NO_THROW(this->ilu_pre_factory->generate(this->l_u_composition)); } -TEST_F(Ilu, BuildsCustomWithoutThrowing2) +TYPED_TEST(Ilu, BuildsCustomWithoutThrowing2) { - ASSERT_NO_THROW(ilu_pre_factory->generate(mtx)); + ASSERT_NO_THROW(this->ilu_pre_factory->generate(this->mtx)); } -TEST_F(Ilu, ThrowOnWrongCompositionInput) +TYPED_TEST(Ilu, ThrowOnWrongCompositionInput) { - std::shared_ptr composition = composition::create(l_factor); + using Composition = typename TestFixture::Composition; + std::shared_ptr composition = + Composition::create(this->l_factor); - ASSERT_THROW(ilu_pre_factory->generate(composition), gko::NotSupported); + ASSERT_THROW(this->ilu_pre_factory->generate(composition), + 
gko::NotSupported); } -TEST_F(Ilu, ThrowOnWrongCompositionInput2) +TYPED_TEST(Ilu, ThrowOnWrongCompositionInput2) { - std::shared_ptr composition = - composition::create(l_factor, u_factor, l_factor); + using Composition = typename TestFixture::Composition; + std::shared_ptr composition = + Composition::create(this->l_factor, this->u_factor, this->l_factor); - ASSERT_THROW(ilu_pre_factory->generate(composition), gko::NotSupported); + ASSERT_THROW(this->ilu_pre_factory->generate(composition), + gko::NotSupported); } -TEST_F(Ilu, SetsCorrectMatrices) +TYPED_TEST(Ilu, SetsCorrectMatrices) { - auto ilu = ilu_pre_factory->generate(l_u_composition); + using Mtx = typename TestFixture::Mtx; + auto ilu = this->ilu_pre_factory->generate(this->l_u_composition); auto internal_l_factor = ilu->get_l_solver()->get_system_matrix(); auto internal_u_factor = ilu->get_u_solver()->get_system_matrix(); // These convert steps are required since `get_system_matrix` usually // just returns `LinOp`, which `GKO_ASSERT_MTX_NEAR` can not use properly - std::unique_ptr converted_l_factor{Mtx::create(exec)}; - std::unique_ptr converted_u_factor{Mtx::create(exec)}; + std::unique_ptr converted_l_factor{Mtx::create(this->exec)}; + std::unique_ptr converted_u_factor{Mtx::create(this->exec)}; gko::as>(internal_l_factor.get()) ->convert_to(converted_l_factor.get()); gko::as>(internal_u_factor.get()) ->convert_to(converted_u_factor.get()); - GKO_ASSERT_MTX_NEAR(converted_l_factor, l_factor, 0); - GKO_ASSERT_MTX_NEAR(converted_u_factor, u_factor, 0); + GKO_ASSERT_MTX_NEAR(converted_l_factor, this->l_factor, 0); + GKO_ASSERT_MTX_NEAR(converted_u_factor, this->u_factor, 0); } -TEST_F(Ilu, CanBeCopied) +TYPED_TEST(Ilu, CanBeCopied) { - auto ilu = ilu_pre_factory->generate(l_u_composition); + using Mtx = typename TestFixture::Mtx; + using ilu_prec_type = typename TestFixture::ilu_prec_type; + using Composition = typename TestFixture::Composition; + auto ilu = 
this->ilu_pre_factory->generate(this->l_u_composition); auto before_l_solver = ilu->get_l_solver(); auto before_u_solver = ilu->get_u_solver(); // The switch up of matrices is intentional, to make sure they are distinct! - auto u_l_composition = composition::create(u_factor, l_factor); - auto copied = - ilu_prec_type::build().on(exec)->generate(gko::share(u_l_composition)); + auto u_l_composition = Composition::create(this->u_factor, this->l_factor); + auto copied = ilu_prec_type::build() + .on(this->exec) + ->generate(gko::share(u_l_composition)); copied->copy_from(ilu.get()); @@ -198,15 +212,18 @@ TEST_F(Ilu, CanBeCopied) } -TEST_F(Ilu, CanBeMoved) +TYPED_TEST(Ilu, CanBeMoved) { - auto ilu = ilu_pre_factory->generate(l_u_composition); + using ilu_prec_type = typename TestFixture::ilu_prec_type; + using Composition = typename TestFixture::Composition; + auto ilu = this->ilu_pre_factory->generate(this->l_u_composition); auto before_l_solver = ilu->get_l_solver(); auto before_u_solver = ilu->get_u_solver(); // The switch up of matrices is intentional, to make sure they are distinct! 
- auto u_l_composition = composition::create(u_factor, l_factor); - auto moved = - ilu_prec_type::build().on(exec)->generate(gko::share(u_l_composition)); + auto u_l_composition = Composition::create(this->u_factor, this->l_factor); + auto moved = ilu_prec_type::build() + .on(this->exec) + ->generate(gko::share(u_l_composition)); moved->copy_from(std::move(ilu)); @@ -215,9 +232,9 @@ TEST_F(Ilu, CanBeMoved) } -TEST_F(Ilu, CanBeCloned) +TYPED_TEST(Ilu, CanBeCloned) { - auto ilu = ilu_pre_factory->generate(l_u_composition); + auto ilu = this->ilu_pre_factory->generate(this->l_u_composition); auto before_l_solver = ilu->get_l_solver(); auto before_u_solver = ilu->get_u_solver(); @@ -228,28 +245,60 @@ TEST_F(Ilu, CanBeCloned) } -TEST_F(Ilu, SolvesDefaultSingleRhs) +TYPED_TEST(Ilu, CanBeTransposed) { - const auto b = gko::initialize({1.0, 3.0, 6.0}, exec); - auto x = Mtx::create(exec, gko::dim<2>{3, 1}); - x->copy_from(b.get()); + using Ilu = typename TestFixture::ilu_prec_type; + using Mtx = typename TestFixture::Mtx; + auto ilu = this->ilu_pre_factory->generate(this->l_u_composition); + auto l_ref = gko::as(ilu->get_l_solver()->get_system_matrix()); + auto u_ref = gko::as(ilu->get_u_solver()->get_system_matrix()); + + auto transp = gko::as(ilu->transpose()); + + auto l_transp = gko::as( + gko::as(transp->get_u_solver()->get_system_matrix())->transpose()); + auto u_transp = gko::as( + gko::as(transp->get_l_solver()->get_system_matrix())->transpose()); + GKO_ASSERT_MTX_EQ_SPARSITY(l_ref, l_transp); + GKO_ASSERT_MTX_EQ_SPARSITY(u_ref, u_transp); + GKO_ASSERT_MTX_NEAR(l_ref, l_transp, 0); + GKO_ASSERT_MTX_NEAR(u_ref, u_transp, 0); +} - auto preconditioner = - default_ilu_prec_type::build().on(exec)->generate(mtx); - preconditioner->apply(b.get(), x.get()); - // Since it uses TRS per default, the result should be accurate - GKO_ASSERT_MTX_NEAR(x.get(), l({-0.125, 0.25, 1.0}), 1e-14); +TYPED_TEST(Ilu, CanBeConjTransposed) +{ + using Ilu = typename TestFixture::ilu_prec_type; 
+ using Mtx = typename TestFixture::Mtx; + auto ilu = this->ilu_pre_factory->generate(this->l_u_composition); + auto l_ref = gko::as(ilu->get_l_solver()->get_system_matrix()); + auto u_ref = gko::as(ilu->get_u_solver()->get_system_matrix()); + + auto transp = gko::as(ilu->conj_transpose()); + + auto l_transp = + gko::as(gko::as(transp->get_u_solver()->get_system_matrix()) + ->conj_transpose()); + auto u_transp = + gko::as(gko::as(transp->get_l_solver()->get_system_matrix()) + ->conj_transpose()); + GKO_ASSERT_MTX_EQ_SPARSITY(l_ref, l_transp); + GKO_ASSERT_MTX_EQ_SPARSITY(u_ref, u_transp); + GKO_ASSERT_MTX_NEAR(l_ref, l_transp, 0); + GKO_ASSERT_MTX_NEAR(u_ref, u_transp, 0); } -TEST_F(Ilu, SolvesCustomTypeDefaultFactorySingleRhs) +TYPED_TEST(Ilu, SolvesCustomTypeDefaultFactorySingleRhs) { - const auto b = gko::initialize({1.0, 3.0, 6.0}, exec); - auto x = Mtx::create(exec, gko::dim<2>{3, 1}); + using ilu_prec_type = typename TestFixture::ilu_prec_type; + using Mtx = typename TestFixture::Mtx; + const auto b = gko::initialize({1.0, 3.0, 6.0}, this->exec); + auto x = Mtx::create(this->exec, gko::dim<2>{3, 1}); x->copy_from(b.get()); - auto preconditioner = ilu_prec_type::build().on(exec)->generate(mtx); + auto preconditioner = + ilu_prec_type::build().on(this->exec)->generate(this->mtx); preconditioner->apply(b.get(), x.get()); // Since it uses Bicgstab with default parmeters, the result will not be @@ -258,145 +307,201 @@ TEST_F(Ilu, SolvesCustomTypeDefaultFactorySingleRhs) } -TEST_F(Ilu, SolvesSingleRhsWithParIlu) +TYPED_TEST(Ilu, SolvesSingleRhsWithParIlu) { - const auto b = gko::initialize({1.0, 3.0, 6.0}, exec); - auto x = Mtx::create(exec, gko::dim<2>{3, 1}); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + const auto b = gko::initialize({1.0, 3.0, 6.0}, this->exec); + auto x = Mtx::create(this->exec, gko::dim<2>{3, 1}); x->copy_from(b.get()); auto par_ilu_fact = - gko::factorization::ParIlu::build().on(exec); - 
auto par_ilu = par_ilu_fact->generate(mtx); + gko::factorization::ParIlu::build().on(this->exec); + auto par_ilu = par_ilu_fact->generate(this->mtx); - auto preconditioner = ilu_pre_factory->generate(gko::share(par_ilu)); + auto preconditioner = this->ilu_pre_factory->generate(gko::share(par_ilu)); preconditioner->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x.get(), l({-0.125, 0.25, 1.0}), 1e-14); + GKO_ASSERT_MTX_NEAR(x.get(), l({-0.125, 0.25, 1.0}), + r::value * 1e+1); } -TEST_F(Ilu, SolvesSingleRhsWithComposition) +TYPED_TEST(Ilu, SolvesSingleRhsWithComposition) { - const auto b = gko::initialize({1.0, 3.0, 6.0}, exec); - auto x = Mtx::create(exec, gko::dim<2>{3, 1}); + using Mtx = typename TestFixture::Mtx; + const auto b = gko::initialize({1.0, 3.0, 6.0}, this->exec); + auto x = Mtx::create(this->exec, gko::dim<2>{3, 1}); x->copy_from(b.get()); - auto preconditioner = ilu_pre_factory->generate(l_u_composition); + auto preconditioner = + this->ilu_pre_factory->generate(this->l_u_composition); preconditioner->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x.get(), l({-0.125, 0.25, 1.0}), 1e-14); + GKO_ASSERT_MTX_NEAR(x.get(), l({-0.125, 0.25, 1.0}), + r::value * 1e+1); } -TEST_F(Ilu, SolvesSingleRhsWithMtx) +TYPED_TEST(Ilu, SolvesSingleRhsWithMtx) { - const auto b = gko::initialize({1.0, 3.0, 6.0}, exec); - auto x = Mtx::create(exec, gko::dim<2>{3, 1}); + using Mtx = typename TestFixture::Mtx; + const auto b = gko::initialize({1.0, 3.0, 6.0}, this->exec); + auto x = Mtx::create(this->exec, gko::dim<2>{3, 1}); x->copy_from(b.get()); - auto preconditioner = ilu_pre_factory->generate(mtx); + auto preconditioner = this->ilu_pre_factory->generate(this->mtx); preconditioner->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x.get(), l({-0.125, 0.25, 1.0}), 1e-14); + GKO_ASSERT_MTX_NEAR(x.get(), l({-0.125, 0.25, 1.0}), + r::value * 1e+1); } -TEST_F(Ilu, SolvesReverseSingleRhs) +TYPED_TEST(Ilu, SolvesReverseSingleRhs) { - const auto b = gko::initialize({1.0, 3.0, 6.0}, 
exec); - auto x = Mtx::create(exec, gko::dim<2>{3, 1}); + using Mtx = typename TestFixture::Mtx; + const auto b = gko::initialize({1.0, 3.0, 6.0}, this->exec); + auto x = Mtx::create(this->exec, gko::dim<2>{3, 1}); x->copy_from(b.get()); - auto preconditioner = ilu_rev_pre_factory->generate(l_u_composition); + auto preconditioner = + this->ilu_rev_pre_factory->generate(this->l_u_composition); preconditioner->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x.get(), l({-0.625, 0.875, 1.75}), 1e-14); + GKO_ASSERT_MTX_NEAR(x.get(), l({-0.625, 0.875, 1.75}), + r::value * 1e+1); } -TEST_F(Ilu, SolvesAdvancedSingleRhs) +TYPED_TEST(Ilu, SolvesAdvancedSingleRhs) { + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; const value_type alpha{2.0}; - const auto alpha_linop = gko::initialize({alpha}, exec); + const auto alpha_linop = gko::initialize({alpha}, this->exec); const value_type beta{-1}; - const auto beta_linop = gko::initialize({beta}, exec); - const auto b = gko::initialize({-3.0, 6.0, 9.0}, exec); - auto x = gko::initialize({1.0, 2.0, 3.0}, exec); - auto preconditioner = ilu_pre_factory->generate(l_u_composition); + const auto beta_linop = gko::initialize({beta}, this->exec); + const auto b = gko::initialize({-3.0, 6.0, 9.0}, this->exec); + auto x = gko::initialize({1.0, 2.0, 3.0}, this->exec); + auto preconditioner = + this->ilu_pre_factory->generate(this->l_u_composition); preconditioner->apply(alpha_linop.get(), b.get(), beta_linop.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x.get(), l({-7.0, 2.0, -1.0}), 1e-14); + GKO_ASSERT_MTX_NEAR(x.get(), l({-7.0, 2.0, -1.0}), r::value); } -TEST_F(Ilu, SolvesAdvancedReverseSingleRhs) +TYPED_TEST(Ilu, SolvesAdvancedReverseSingleRhs) { + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; const value_type alpha{2.0}; - const auto alpha_linop = gko::initialize({alpha}, exec); + const auto alpha_linop = gko::initialize({alpha}, this->exec); const 
value_type beta{-1}; - const auto beta_linop = gko::initialize({beta}, exec); - const auto b = gko::initialize({-3.0, 6.0, 9.0}, exec); - auto x = gko::initialize({1.0, 2.0, 3.0}, exec); - auto preconditioner = ilu_rev_pre_factory->generate(l_u_composition); + const auto beta_linop = gko::initialize({beta}, this->exec); + const auto b = gko::initialize({-3.0, 6.0, 9.0}, this->exec); + auto x = gko::initialize({1.0, 2.0, 3.0}, this->exec); + auto preconditioner = + this->ilu_rev_pre_factory->generate(this->l_u_composition); preconditioner->apply(alpha_linop.get(), b.get(), beta_linop.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x.get(), l({-7.75, 6.25, 1.5}), 1e-14); + GKO_ASSERT_MTX_NEAR(x.get(), l({-7.75, 6.25, 1.5}), + r::value * 1e+1); } -TEST_F(Ilu, SolvesMultipleRhs) +TYPED_TEST(Ilu, SolvesMultipleRhs) { - const auto b = - gko::initialize({{1.0, 8.0}, {3.0, 21.0}, {6.0, 24.0}}, exec); - auto x = Mtx::create(exec, gko::dim<2>{3, 2}); + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + const auto b = gko::initialize( + {I{1.0, 8.0}, I{3.0, 21.0}, I{6.0, 24.0}}, this->exec); + auto x = Mtx::create(this->exec, gko::dim<2>{3, 2}); x->copy_from(b.get()); - auto preconditioner = ilu_pre_factory->generate(l_u_composition); + auto preconditioner = + this->ilu_pre_factory->generate(this->l_u_composition); preconditioner->apply(b.get(), x.get()); GKO_ASSERT_MTX_NEAR(x.get(), l({{-0.125, 2.0}, {0.25, 3.0}, {1.0, 1.0}}), - 1e-14); + r::value * 1e+1); } -TEST_F(Ilu, SolvesDifferentNumberOfRhs) +TYPED_TEST(Ilu, SolvesDifferentNumberOfRhs) { - const auto b1 = gko::initialize({-3.0, 6.0, 9.0}, exec); - auto x11 = Mtx::create(exec, gko::dim<2>{3, 1}); - auto x12 = Mtx::create(exec, gko::dim<2>{3, 1}); + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + const auto b1 = gko::initialize({-3.0, 6.0, 9.0}, this->exec); + auto x11 = Mtx::create(this->exec, gko::dim<2>{3, 1}); + auto x12 = Mtx::create(this->exec, 
gko::dim<2>{3, 1}); x11->copy_from(b1.get()); x12->copy_from(b1.get()); - const auto b2 = - gko::initialize({{1.0, 8.0}, {3.0, 21.0}, {6.0, 24.0}}, exec); - auto x2 = Mtx::create(exec, gko::dim<2>{3, 2}); + const auto b2 = gko::initialize( + {I{1.0, 8.0}, I{3.0, 21.0}, I{6.0, 24.0}}, this->exec); + auto x2 = Mtx::create(this->exec, gko::dim<2>{3, 2}); x2->copy_from(b2.get()); - auto preconditioner = ilu_pre_factory->generate(l_u_composition); + auto preconditioner = + this->ilu_pre_factory->generate(this->l_u_composition); preconditioner->apply(b1.get(), x11.get()); preconditioner->apply(b2.get(), x2.get()); preconditioner->apply(b1.get(), x12.get()); - GKO_ASSERT_MTX_NEAR(x11.get(), l({-3.0, 2.0, 1.0}), 1e-14); + GKO_ASSERT_MTX_NEAR(x11.get(), l({-3.0, 2.0, 1.0}), + r::value * 1e+1); GKO_ASSERT_MTX_NEAR(x2.get(), l({{-0.125, 2.0}, {0.25, 3.0}, {1.0, 1.0}}), - 1e-14); - GKO_ASSERT_MTX_NEAR(x12.get(), x11.get(), 1e-14); + r::value * 1e+1); + GKO_ASSERT_MTX_NEAR(x12.get(), x11.get(), r::value * 1e+1); +} + + +class DefaultIlu : public ::testing::Test { +protected: + using Mtx = gko::matrix::Dense<>; + using default_ilu_prec_type = gko::preconditioner::Ilu<>; + + DefaultIlu() + : exec(gko::ReferenceExecutor::create()), + mtx(gko::initialize({{2., 1., 1.}, {2., 5., 2.}, {2., 5., 5.}}, + exec)) + {} + + std::shared_ptr exec; + std::shared_ptr mtx; +}; + + +TEST_F(DefaultIlu, SolvesDefaultSingleRhs) +{ + const auto b = gko::initialize({1.0, 3.0, 6.0}, this->exec); + auto x = Mtx::create(this->exec, gko::dim<2>{3, 1}); + x->copy_from(b.get()); + + auto preconditioner = + default_ilu_prec_type::build().on(this->exec)->generate(this->mtx); + preconditioner->apply(b.get(), x.get()); + + // Since it uses TRS per default, the result should be accurate + GKO_ASSERT_MTX_NEAR(x.get(), l({-0.125, 0.25, 1.0}), 1e-14); } -TEST_F(Ilu, CanBeUsedAsPreconditioner) +TEST_F(DefaultIlu, CanBeUsedAsPreconditioner) { auto solver = - gko::solver::Bicgstab::build() + 
gko::solver::Bicgstab<>::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(2u).on(exec)) - .with_preconditioner(default_ilu_prec_type::build().on(exec)) - .on(exec) - ->generate(mtx); - auto x = Mtx::create(exec, gko::dim<2>{3, 1}); - const auto b = gko::initialize({1.0, 3.0, 6.0}, exec); + gko::stop::Iteration::build().with_max_iters(2u).on(this->exec)) + .with_preconditioner(default_ilu_prec_type::build().on(this->exec)) + .on(this->exec) + ->generate(this->mtx); + auto x = Mtx::create(this->exec, gko::dim<2>{3, 1}); + const auto b = gko::initialize({1.0, 3.0, 6.0}, this->exec); x->copy_from(b.get()); solver->apply(b.get(), x.get()); @@ -405,19 +510,19 @@ TEST_F(Ilu, CanBeUsedAsPreconditioner) } -TEST_F(Ilu, CanBeUsedAsGeneratedPreconditioner) +TEST_F(DefaultIlu, CanBeUsedAsGeneratedPreconditioner) { std::shared_ptr precond = - default_ilu_prec_type::build().on(exec)->generate(mtx); + default_ilu_prec_type::build().on(this->exec)->generate(this->mtx); auto solver = - gko::solver::Bicgstab::build() + gko::solver::Bicgstab<>::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(2u).on(exec)) + gko::stop::Iteration::build().with_max_iters(2u).on(this->exec)) .with_generated_preconditioner(precond) - .on(exec) - ->generate(mtx); - auto x = Mtx::create(exec, gko::dim<2>{3, 1}); - const auto b = gko::initialize({1.0, 3.0, 6.0}, exec); + .on(this->exec) + ->generate(this->mtx); + auto x = Mtx::create(this->exec, gko::dim<2>{3, 1}); + const auto b = gko::initialize({1.0, 3.0, 6.0}, this->exec); x->copy_from(b.get()); solver->apply(b.get(), x.get()); diff --git a/reference/test/preconditioner/isai_kernels.cpp b/reference/test/preconditioner/isai_kernels.cpp new file mode 100644 index 00000000000..06ad239732d --- /dev/null +++ b/reference/test/preconditioner/isai_kernels.cpp @@ -0,0 +1,924 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "reference/preconditioner/isai_kernels.cpp" + + +#include +#include +#include +#include + + +#include + + +#include +#include +#include +#include +#include +#include + + +#include "core/test/utils.hpp" +#include "matrices/config.hpp" + + +namespace { + + +template +class Isai : public ::testing::Test { +protected: + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using LowerIsai = gko::preconditioner::LowerIsai; + using UpperIsai = gko::preconditioner::UpperIsai; + using Mtx = gko::matrix::Csr; + using Dense = gko::matrix::Dense; + using Csr = gko::matrix::Csr; + + Isai() + : exec{gko::ReferenceExecutor::create()}, + l_dense{gko::initialize( + {{2., 0., 0.}, {1., -2., 0.}, {-1., 1., -1.}}, exec)}, + l_dense_inv{gko::initialize( + {{.5, 0., 0.}, {.25, -.5, 0.}, {-.25, -.5, -1.}}, exec)}, + u_dense{gko::initialize( + {{4., 1., -1.}, {0., -2., 4.}, {0., 0., 8.}}, exec)}, + u_dense_inv{gko::initialize( + {{.25, .125, -0.03125}, {0., -.5, .25}, {0., 0., .125}}, exec)}, + l_csr{Csr::create(exec)}, + l_csr_inv{Csr::create(exec)}, + u_csr{Csr::create(exec)}, + u_csr_inv{Csr::create(exec)}, + l_sparse{Csr::create(exec, gko::dim<2>(4, 4), + I{-1., 2., 4., 5., -4., 8., -8.}, + I{0, 0, 1, 1, 2, 2, 3}, + I{0, 1, 3, 5, 7})}, + l_s_unsorted{Csr::create(exec, gko::dim<2>(4, 4), + I{-1., 4., 2., 5., -4., -8., 8.}, + I{0, 1, 0, 1, 2, 3, 2}, + I{0, 1, 3, 5, 7})}, + l_sparse_inv{ + Csr::create(exec, gko::dim<2>(4, 4), + I{-1., .5, .25, .3125, -.25, -.25, -.125}, + I{0, 0, 1, 1, 2, 2, 3}, + I{0, 1, 3, 5, 7})}, + l_sparse_inv2{Csr::create(exec, gko::dim<2>(4, 4), + I{-1., .5, .25, .625, .3125, + -.25, .3125, -.25, -.125}, + I{0, 0, 1, 0, 1, 2, 1, 2, 3}, + I{0, 1, 3, 6, 9})}, + l_sparse_inv3{ + Csr::create(exec, gko::dim<2>(4, 4), + I{-1., .5, .25, .625, .3125, -.25, .625, + .3125, 
-.25, -.125}, + I{0, 0, 1, 0, 1, 2, 0, 1, 2, 3}, + I{0, 1, 3, 6, 10})}, + l_sparse2{Csr::create(exec, gko::dim<2>(4, 4), + I{-2, 1, 4, 1, -2, 1, -1, 1, 2}, + I{0, 0, 1, 1, 2, 0, 1, 2, 3}, + I{0, 1, 3, 5, 9})}, + l_sparse2_inv{Csr::create(exec, gko::dim<2>(4, 4), + I{-.5, .125, .25, .125, -.5, + .28125, .0625, 0.25, 0.5}, + I{0, 0, 1, 1, 2, 0, 1, 2, 3}, + I{0, 1, 3, 5, 9})}, + u_sparse{ + Csr::create(exec, gko::dim<2>(4, 4), + I{-2., 1., -1., 1., 4., 1., -2., 1., 2.}, + I{0, 1, 2, 3, 1, 2, 2, 3, 3}, + I{0, 4, 6, 8, 9})}, + u_s_unsorted{ + Csr::create(exec, gko::dim<2>(4, 4), + I{-2., -1., 1., 1., 1., 4., -2., 1., 2.}, + I{0, 2, 1, 3, 2, 1, 2, 3, 3}, + I{0, 4, 6, 8, 9})}, + u_sparse_inv{Csr::create( + exec, gko::dim<2>(4, 4), + I{-.5, .125, .3125, .09375, .25, .125, -.5, .25, .5}, + I{0, 1, 2, 3, 1, 2, 2, 3, 3}, + I{0, 4, 6, 8, 9})}, + u_sparse_inv2{Csr::create(exec, gko::dim<2>(4, 4), + I{-.5, .125, .3125, .09375, .25, + .125, -.0625, -.5, .25, .5}, + I{0, 1, 2, 3, 1, 2, 3, 2, 3, 3}, + I{0, 4, 7, 9, 10})} + { + lower_isai_factory = LowerIsai::build().on(exec); + upper_isai_factory = UpperIsai::build().on(exec); + l_dense->convert_to(lend(l_csr)); + l_dense_inv->convert_to(lend(l_csr_inv)); + u_dense->convert_to(lend(u_csr)); + u_dense_inv->convert_to(lend(u_csr_inv)); + l_csr_longrow = read("isai_l.mtx"); + l_csr_longrow_e = read("isai_l_excess.mtx"); + l_csr_longrow_e_rhs = read("isai_l_excess_rhs.mtx"); + l_csr_longrow_inv_partial = read("isai_l_inv_partial.mtx"); + l_csr_longrow_inv = read("isai_l_inv.mtx"); + u_csr_longrow = read("isai_u.mtx"); + u_csr_longrow_e = read("isai_u_excess.mtx"); + u_csr_longrow_e_rhs = read("isai_u_excess_rhs.mtx"); + u_csr_longrow_inv_partial = read("isai_u_inv_partial.mtx"); + u_csr_longrow_inv = read("isai_u_inv.mtx"); + } + + template + std::unique_ptr read(const char *name) + { + std::ifstream mtxstream{std::string{gko::matrices::location_isai_mtxs} + + name}; + auto result = gko::read(mtxstream, exec); + // to avoid removing 
0s, the matrices store 12345 instead + for (gko::size_type i = 0; i < result->get_num_stored_elements(); ++i) { + auto &val = result->get_values()[i]; + if (val == static_cast(12345.0)) { + val = 0; + } + } + return std::move(result); + } + + std::unique_ptr clone_allocations(const Csr *csr_mtx) + { + const auto num_elems = csr_mtx->get_num_stored_elements(); + auto sparsity = csr_mtx->clone(); + + // values are now filled with invalid data to catch potential errors + std::fill_n(sparsity->get_values(), num_elems, -gko::one()); + return sparsity; + } + + std::unique_ptr transpose(const Csr *mtx) + { + return gko::as(mtx->transpose()); + } + + std::shared_ptr exec; + std::unique_ptr lower_isai_factory; + std::unique_ptr upper_isai_factory; + std::shared_ptr l_dense; + std::shared_ptr l_dense_inv; + std::shared_ptr u_dense; + std::shared_ptr u_dense_inv; + std::shared_ptr l_csr; + std::shared_ptr l_csr_inv; + std::shared_ptr l_csr_longrow; + std::shared_ptr l_csr_longrow_e; + std::shared_ptr l_csr_longrow_e_rhs; + std::shared_ptr l_csr_longrow_inv_partial; + std::shared_ptr l_csr_longrow_inv; + std::shared_ptr u_csr; + std::shared_ptr u_csr_inv; + std::shared_ptr u_csr_longrow; + std::shared_ptr u_csr_longrow_e; + std::shared_ptr u_csr_longrow_e_rhs; + std::shared_ptr u_csr_longrow_inv_partial; + std::shared_ptr u_csr_longrow_inv; + std::shared_ptr l_sparse; + std::shared_ptr l_s_unsorted; + std::shared_ptr l_sparse_inv; + std::shared_ptr l_sparse_inv2; + std::shared_ptr l_sparse_inv3; + std::shared_ptr l_sparse2; + std::shared_ptr l_sparse2_inv; + std::shared_ptr u_sparse; + std::shared_ptr u_s_unsorted; + std::shared_ptr u_sparse_inv; + std::shared_ptr u_sparse_inv2; +}; + +TYPED_TEST_CASE(Isai, gko::test::ValueIndexTypes); + + +TYPED_TEST(Isai, KernelGenerateL1) +{ + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto result = 
this->clone_allocations(lend(this->l_csr)); + auto num_rows = result->get_size()[0]; + gko::Array a1(this->exec, num_rows + 1); + gko::Array a2(this->exec, num_rows + 1); + // zero-filled array + gko::Array zeros(this->exec, num_rows + 1); + std::fill_n(zeros.get_data(), num_rows + 1, 0); + + gko::kernels::reference::isai::generate_tri_inverse( + this->exec, lend(this->l_csr), lend(result), a1.get_data(), + a2.get_data(), true); + + GKO_ASSERT_MTX_EQ_SPARSITY(result, this->l_csr_inv); + GKO_ASSERT_MTX_NEAR(result, this->l_csr_inv, r::value); + // no row above the size limit -> zero array + GKO_ASSERT_ARRAY_EQ(a1, zeros); + GKO_ASSERT_ARRAY_EQ(a2, zeros); +} + + +TYPED_TEST(Isai, KernelGenerateL2) +{ + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + const auto l_mtx = this->transpose(lend(this->u_csr)); + auto result = this->clone_allocations(lend(l_mtx)); + auto num_rows = result->get_size()[0]; + gko::Array a1(this->exec, num_rows + 1); + gko::Array a2(this->exec, num_rows + 1); + // zero-filled array + gko::Array zeros(this->exec, num_rows + 1); + std::fill_n(zeros.get_data(), num_rows + 1, 0); + + gko::kernels::reference::isai::generate_tri_inverse( + this->exec, lend(l_mtx), lend(result), a1.get_data(), a2.get_data(), + true); + + const auto expected = this->transpose(lend(this->u_csr_inv)); + GKO_ASSERT_MTX_EQ_SPARSITY(result, expected); + GKO_ASSERT_MTX_NEAR(result, expected, r::value); + // no row above the size limit -> zero array + GKO_ASSERT_ARRAY_EQ(a1, zeros); + GKO_ASSERT_ARRAY_EQ(a2, zeros); +} + + +TYPED_TEST(Isai, KernelGenerateLsparse1) +{ + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto result = this->clone_allocations(lend(this->l_sparse)); + auto num_rows = result->get_size()[0]; + gko::Array a1(this->exec, num_rows + 1); + gko::Array 
a2(this->exec, num_rows + 1); + // zero-filled array + gko::Array zeros(this->exec, num_rows + 1); + std::fill_n(zeros.get_data(), num_rows + 1, 0); + + gko::kernels::reference::isai::generate_tri_inverse( + this->exec, lend(this->l_sparse), lend(result), a1.get_data(), + a2.get_data(), true); + + GKO_ASSERT_MTX_EQ_SPARSITY(result, this->l_sparse_inv); + GKO_ASSERT_MTX_NEAR(result, this->l_sparse_inv, r::value); + // no row above the size limit -> zero array + GKO_ASSERT_ARRAY_EQ(a1, zeros); + GKO_ASSERT_ARRAY_EQ(a2, zeros); +} + + +TYPED_TEST(Isai, KernelGenerateLsparse2) +{ + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto result = this->clone_allocations(lend(this->l_sparse2)); + auto num_rows = result->get_size()[0]; + gko::Array a1(this->exec, num_rows + 1); + gko::Array a2(this->exec, num_rows + 1); + // zero-filled array + gko::Array zeros(this->exec, num_rows + 1); + std::fill_n(zeros.get_data(), num_rows + 1, 0); + + gko::kernels::reference::isai::generate_tri_inverse( + this->exec, lend(this->l_sparse2), lend(result), a1.get_data(), + a2.get_data(), true); + + GKO_ASSERT_MTX_EQ_SPARSITY(result, this->l_sparse2_inv); + GKO_ASSERT_MTX_NEAR(result, this->l_sparse2_inv, r::value); + // no row above the size limit -> zero array + GKO_ASSERT_ARRAY_EQ(a1, zeros); + GKO_ASSERT_ARRAY_EQ(a2, zeros); +} + + +TYPED_TEST(Isai, KernelGenerateLsparse3) +{ + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + const auto l_mtx = this->transpose(lend(this->u_sparse)); + auto result = this->clone_allocations(lend(l_mtx)); + auto num_rows = result->get_size()[0]; + gko::Array a1(this->exec, num_rows + 1); + gko::Array a2(this->exec, num_rows + 1); + // zero-filled array + gko::Array zeros(this->exec, num_rows + 1); + std::fill_n(zeros.get_data(), num_rows + 1, 0); + + 
gko::kernels::reference::isai::generate_tri_inverse( + this->exec, lend(l_mtx), lend(result), a1.get_data(), a2.get_data(), + true); + + // Results in a slightly different version than u_sparse_inv->transpose() + // because a different row-sparsity pattern is used in u_sparse vs. l_mtx + // (only one value changes compared to u_sparse_inv->transpose()) + const auto expected = Csr::create( + this->exec, gko::dim<2>(4, 4), + I{-.5, .125, .25, .3125, .125, -.5, .125, .25, .5}, + I{0, 0, 1, 0, 1, 2, 0, 2, 3}, I{0, 1, 3, 6, 9}); + GKO_ASSERT_MTX_EQ_SPARSITY(result, expected); + GKO_ASSERT_MTX_NEAR(result, expected, r::value); + // no row above the size limit -> zero array + GKO_ASSERT_ARRAY_EQ(a1, zeros); + GKO_ASSERT_ARRAY_EQ(a2, zeros); +} + + +TYPED_TEST(Isai, KernelGenerateLLongrow) +{ + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto result = this->clone_allocations(lend(this->l_csr_longrow)); + auto num_rows = result->get_size()[0]; + gko::Array a1(this->exec, num_rows + 1); + gko::Array a2(this->exec, num_rows + 1); + // zero-filled array + gko::Array zeros(this->exec, num_rows + 1); + std::fill_n(zeros.get_data(), num_rows + 1, 0); + // only the 32nd row has some excess storage + auto a1_expect = zeros; + a1_expect.get_data()[33] = 33; + a1_expect.get_data()[34] = 33; + a1_expect.get_data()[35] = 66; + auto a2_expect = zeros; + a2_expect.get_data()[33] = 124; + a2_expect.get_data()[34] = 124; + a2_expect.get_data()[35] = 248; + + gko::kernels::reference::isai::generate_tri_inverse( + this->exec, lend(this->l_csr_longrow), lend(result), a1.get_data(), + a2.get_data(), true); + + GKO_ASSERT_MTX_EQ_SPARSITY(result, this->l_csr_longrow_inv_partial); + GKO_ASSERT_MTX_NEAR(result, this->l_csr_longrow_inv_partial, + r::value); + // no row above the size limit -> zero array + GKO_ASSERT_ARRAY_EQ(a1, a1_expect); + GKO_ASSERT_ARRAY_EQ(a2, a2_expect); +} + + 
+TYPED_TEST(Isai, KernelGenerateExcessLLongrow) +{ + using Csr = typename TestFixture::Csr; + using Dense = typename TestFixture::Dense; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto num_rows = this->l_csr_longrow->get_size()[0]; + gko::Array zeros(this->exec, num_rows + 1); + std::fill_n(zeros.get_data(), num_rows + 1, 0); + // only the 32nd row has some excess storage + auto a1 = zeros; + a1.get_data()[33] = 33; + a1.get_data()[34] = 33; + a1.get_data()[35] = 66; + auto a2 = zeros; + a2.get_data()[33] = 124; + a2.get_data()[34] = 124; + a2.get_data()[35] = 248; + auto result = Csr::create(this->exec, gko::dim<2>(66, 66), 248); + auto result_rhs = Dense::create(this->exec, gko::dim<2>(66, 1)); + + gko::kernels::reference::isai::generate_excess_system( + this->exec, lend(this->l_csr_longrow), lend(this->l_csr_longrow), + a1.get_const_data(), a2.get_const_data(), lend(result), + lend(result_rhs)); + + GKO_ASSERT_MTX_EQ_SPARSITY(result, this->l_csr_longrow_e); + GKO_ASSERT_MTX_NEAR(result, this->l_csr_longrow_e, 0); + GKO_ASSERT_MTX_NEAR(result_rhs, this->l_csr_longrow_e_rhs, 0); +} + + +TYPED_TEST(Isai, KernelGenerateU1) +{ + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + const auto u_mtx = this->transpose(lend(this->l_csr)); + auto result = this->clone_allocations(lend(u_mtx)); + auto num_rows = result->get_size()[0]; + gko::Array a1(this->exec, num_rows + 1); + gko::Array a2(this->exec, num_rows + 1); + // zero-filled array + gko::Array zeros(this->exec, num_rows + 1); + std::fill_n(zeros.get_data(), num_rows + 1, 0); + + gko::kernels::reference::isai::generate_tri_inverse( + this->exec, lend(u_mtx), lend(result), a1.get_data(), a2.get_data(), + false); + + auto expected = this->transpose(lend(this->l_csr_inv)); + GKO_ASSERT_MTX_EQ_SPARSITY(result, expected); + GKO_ASSERT_MTX_NEAR(result, 
expected, r::value); + // no row above the size limit -> zero array + GKO_ASSERT_ARRAY_EQ(a1, zeros); + GKO_ASSERT_ARRAY_EQ(a2, zeros); +} + + +TYPED_TEST(Isai, KernelGenerateU2) +{ + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto result = this->clone_allocations(lend(this->u_csr)); + auto num_rows = result->get_size()[0]; + gko::Array a1(this->exec, num_rows + 1); + gko::Array a2(this->exec, num_rows + 1); + // zero-filled array + gko::Array zeros(this->exec, num_rows + 1); + std::fill_n(zeros.get_data(), num_rows + 1, 0); + + gko::kernels::reference::isai::generate_tri_inverse( + this->exec, lend(this->u_csr), lend(result), a1.get_data(), + a2.get_data(), false); + + GKO_ASSERT_MTX_EQ_SPARSITY(result, this->u_csr_inv); + GKO_ASSERT_MTX_NEAR(result, this->u_csr_inv, r::value); + // no row above the size limit -> zero array + GKO_ASSERT_ARRAY_EQ(a1, zeros); + GKO_ASSERT_ARRAY_EQ(a2, zeros); +} + + +TYPED_TEST(Isai, KernelGenerateUsparse1) +{ + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + const auto u_mtx = this->transpose(lend(this->l_sparse)); + auto result = this->clone_allocations(lend(u_mtx)); + auto num_rows = result->get_size()[0]; + gko::Array a1(this->exec, num_rows + 1); + gko::Array a2(this->exec, num_rows + 1); + // zero-filled array + gko::Array zeros(this->exec, num_rows + 1); + std::fill_n(zeros.get_data(), num_rows + 1, 0); + + gko::kernels::reference::isai::generate_tri_inverse( + this->exec, lend(u_mtx), lend(result), a1.get_data(), a2.get_data(), + false); + + const auto expected = this->transpose(lend(this->l_sparse_inv)); + GKO_ASSERT_MTX_EQ_SPARSITY(result, expected); + GKO_ASSERT_MTX_NEAR(result, expected, r::value); + // no row above the size limit -> zero array + GKO_ASSERT_ARRAY_EQ(a1, zeros); + GKO_ASSERT_ARRAY_EQ(a2, zeros); +} 
+ + +TYPED_TEST(Isai, KernelGenerateUsparse2) +{ + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + const auto u_mtx = this->transpose(this->l_sparse2.get()); + auto result = this->clone_allocations(lend(u_mtx)); + auto num_rows = result->get_size()[0]; + gko::Array a1(this->exec, num_rows + 1); + gko::Array a2(this->exec, num_rows + 1); + // zero-filled array + gko::Array zeros(this->exec, num_rows + 1); + std::fill_n(zeros.get_data(), num_rows + 1, 0); + + gko::kernels::reference::isai::generate_tri_inverse( + this->exec, lend(u_mtx), lend(result), a1.get_data(), a2.get_data(), + false); + + // Results in a slightly different version than l_sparse2_inv->transpose() + // because a different row-sparsity pattern is used in l_sparse2 vs. u_mtx + // (only one value changes compared to l_sparse2_inv->transpose()) + const auto expected = Csr::create( + this->exec, gko::dim<2>(4, 4), + I{-.5, .125, .3125, .25, .125, .0625, -.5, .25, .5}, + I{0, 1, 3, 1, 2, 3, 2, 3, 3}, I{0, 3, 6, 8, 9}); + GKO_ASSERT_MTX_EQ_SPARSITY(result, expected); + GKO_ASSERT_MTX_NEAR(result, expected, r::value); + // no row above the size limit -> zero array + GKO_ASSERT_ARRAY_EQ(a1, zeros); + GKO_ASSERT_ARRAY_EQ(a2, zeros); +} + + +TYPED_TEST(Isai, KernelGenerateUsparse3) +{ + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto result = this->clone_allocations(lend(this->u_sparse)); + auto num_rows = result->get_size()[0]; + gko::Array a1(this->exec, num_rows + 1); + gko::Array a2(this->exec, num_rows + 1); + // zero-filled array + gko::Array zeros(this->exec, num_rows + 1); + std::fill_n(zeros.get_data(), num_rows + 1, 0); + + gko::kernels::reference::isai::generate_tri_inverse( + this->exec, lend(this->u_sparse), lend(result), a1.get_data(), + a2.get_data(), false); + + 
GKO_ASSERT_MTX_EQ_SPARSITY(result, this->u_sparse_inv); + GKO_ASSERT_MTX_NEAR(result, this->u_sparse_inv, r::value); + // no row above the size limit -> zero array + GKO_ASSERT_ARRAY_EQ(a1, zeros); + GKO_ASSERT_ARRAY_EQ(a2, zeros); +} + + +TYPED_TEST(Isai, KernelGenerateULongrow) +{ + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto result = this->clone_allocations(lend(this->u_csr_longrow)); + auto num_rows = result->get_size()[0]; + gko::Array a1(this->exec, num_rows + 1); + gko::Array a2(this->exec, num_rows + 1); + // zero-filled array + gko::Array zeros(this->exec, num_rows + 1); + std::fill_n(zeros.get_data(), num_rows + 1, 0); + // only the 32nd row has some excess storage + auto a1_expect = zeros; + std::fill_n(a1_expect.get_data() + 3, 33, 33); + auto a2_expect = zeros; + std::fill_n(a2_expect.get_data() + 3, 33, 153); + + gko::kernels::reference::isai::generate_tri_inverse( + this->exec, lend(this->u_csr_longrow), lend(result), a1.get_data(), + a2.get_data(), false); + + GKO_ASSERT_MTX_EQ_SPARSITY(result, this->u_csr_longrow_inv_partial); + GKO_ASSERT_MTX_NEAR(result, this->u_csr_longrow_inv_partial, + r::value); + // no row above the size limit -> zero array + GKO_ASSERT_ARRAY_EQ(a1, a1_expect); + GKO_ASSERT_ARRAY_EQ(a2, a2_expect); +} + + +TYPED_TEST(Isai, KernelGenerateExcessULongrow) +{ + using Csr = typename TestFixture::Csr; + using Dense = typename TestFixture::Dense; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto num_rows = this->u_csr_longrow->get_size()[0]; + gko::Array zeros(this->exec, num_rows + 1); + std::fill_n(zeros.get_data(), num_rows + 1, 0); + // only the 32nd row has some excess storage + auto a1 = zeros; + std::fill_n(a1.get_data() + 3, 33, 33); + auto a2 = zeros; + std::fill_n(a2.get_data() + 3, 33, 153); + auto result = Csr::create(this->exec, 
gko::dim<2>(33, 33), 153); + auto result_rhs = Dense::create(this->exec, gko::dim<2>(33, 1)); + + gko::kernels::reference::isai::generate_excess_system( + this->exec, lend(this->u_csr_longrow), lend(this->u_csr_longrow), + a1.get_const_data(), a2.get_const_data(), lend(result), + lend(result_rhs)); + + GKO_ASSERT_MTX_EQ_SPARSITY(result, this->u_csr_longrow_e); + GKO_ASSERT_MTX_NEAR(result, this->u_csr_longrow_e, 0); + GKO_ASSERT_MTX_NEAR(result_rhs, this->u_csr_longrow_e_rhs, 0); +} + + +TYPED_TEST(Isai, KernelScatterExcessSolution) +{ + using Csr = typename TestFixture::Csr; + using Dense = typename TestFixture::Dense; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + gko::Array ptrs{this->exec, I{0, 0, 2, 2, 5, 7, 7}}; + auto mtx = Csr::create(this->exec, gko::dim<2>{6, 6}, + I{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + I{0, 0, 1, 0, 0, 1, 2, 0, 1, 0}, + I{0, 1, 3, 4, 7, 9, 10}); + auto expect = + Csr::create(this->exec, gko::dim<2>{6, 6}, + I{1, 11, 12, 4, 13, 14, 15, 16, 17, 10}, + I{0, 0, 1, 0, 0, 1, 2, 0, 1, 0}, + I{0, 1, 3, 4, 7, 9, 10}); + auto sol = Dense::create(this->exec, gko::dim<2>(7, 1), + I{11, 12, 13, 14, 15, 16, 17}, 1); + + gko::kernels::reference::isai::scatter_excess_solution( + this->exec, ptrs.get_const_data(), sol.get(), mtx.get()); + + GKO_ASSERT_MTX_NEAR(mtx, expect, 0); +} + + +TYPED_TEST(Isai, ReturnsCorrectInverseL) +{ + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + const auto isai = this->lower_isai_factory->generate(this->l_sparse); + + auto l_inv = isai->get_approximate_inverse(); + + GKO_ASSERT_MTX_EQ_SPARSITY(l_inv, this->l_sparse_inv); + GKO_ASSERT_MTX_NEAR(l_inv, this->l_sparse_inv, r::value); +} + + +TYPED_TEST(Isai, ReturnsCorrectInverseLLongrow) +{ + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + const auto isai = this->lower_isai_factory->generate(this->l_csr_longrow); + + 
auto l_inv = isai->get_approximate_inverse(); + + GKO_ASSERT_MTX_EQ_SPARSITY(l_inv, this->l_csr_longrow_inv); + GKO_ASSERT_MTX_NEAR(l_inv, this->l_csr_longrow_inv, r::value); +} + + +TYPED_TEST(Isai, ReturnsCorrectInverseU) +{ + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + const auto isai = this->upper_isai_factory->generate(this->u_sparse); + + auto u_inv = isai->get_approximate_inverse(); + + GKO_ASSERT_MTX_EQ_SPARSITY(u_inv, this->u_sparse_inv); + GKO_ASSERT_MTX_NEAR(u_inv, this->u_sparse_inv, r::value); +} + + +TYPED_TEST(Isai, ReturnsCorrectInverseULongrow) +{ + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + const auto isai = this->upper_isai_factory->generate(this->u_csr_longrow); + + auto u_inv = isai->get_approximate_inverse(); + + GKO_ASSERT_MTX_EQ_SPARSITY(u_inv, this->u_csr_longrow_inv); + GKO_ASSERT_MTX_NEAR(u_inv, this->u_csr_longrow_inv, r::value); +} + + +TYPED_TEST(Isai, ReturnsCorrectInverseLWithL2) +{ + using value_type = typename TestFixture::value_type; + const auto isai = TestFixture::LowerIsai::build() + .with_sparsity_power(2) + .on(this->exec) + ->generate(this->l_sparse); + + auto l_inv = isai->get_approximate_inverse(); + + GKO_ASSERT_MTX_EQ_SPARSITY(l_inv, this->l_sparse_inv2); + GKO_ASSERT_MTX_NEAR(l_inv, this->l_sparse_inv2, r::value); +} + + +TYPED_TEST(Isai, ReturnsCorrectInverseUWithU2) +{ + using value_type = typename TestFixture::value_type; + const auto isai = TestFixture::UpperIsai::build() + .with_sparsity_power(2) + .on(this->exec) + ->generate(this->u_sparse); + + auto u_inv = isai->get_approximate_inverse(); + + GKO_ASSERT_MTX_EQ_SPARSITY(u_inv, this->u_sparse_inv2); + GKO_ASSERT_MTX_NEAR(u_inv, this->u_sparse_inv2, r::value); +} + + +TYPED_TEST(Isai, ReturnsCorrectInverseLWithL3) +{ + using value_type = typename TestFixture::value_type; + const auto isai = TestFixture::LowerIsai::build() + .with_sparsity_power(3) + 
.on(this->exec) + ->generate(this->l_sparse); + + auto l_inv = isai->get_approximate_inverse(); + + GKO_ASSERT_MTX_EQ_SPARSITY(l_inv, this->l_sparse_inv3); + GKO_ASSERT_MTX_NEAR(l_inv, this->l_sparse_inv3, r::value); +} + + +TYPED_TEST(Isai, ReturnsCorrectInverseUWithU3) +{ + using value_type = typename TestFixture::value_type; + const auto isai = TestFixture::UpperIsai::build() + .with_sparsity_power(3) + .on(this->exec) + ->generate(this->u_sparse); + + auto u_inv = isai->get_approximate_inverse(); + + GKO_ASSERT_MTX_EQ_SPARSITY(u_inv, this->u_sparse_inv2); + GKO_ASSERT_MTX_NEAR(u_inv, this->u_sparse_inv2, r::value); +} + + +TYPED_TEST(Isai, GeneratesWithUnsortedCsr) +{ + using Csr = typename TestFixture::Csr; + using T = typename TestFixture::value_type; + + const auto l_isai = this->lower_isai_factory->generate(this->l_s_unsorted); + const auto u_isai = this->upper_isai_factory->generate(this->u_s_unsorted); + auto l_inv = l_isai->get_approximate_inverse(); + auto u_inv = u_isai->get_approximate_inverse(); + + GKO_ASSERT_MTX_NEAR(l_inv, this->l_sparse_inv, r::value); + GKO_ASSERT_MTX_NEAR(u_inv, this->u_sparse_inv, r::value); +} + + +TYPED_TEST(Isai, ApplyWithLMtx) +{ + using Dense = typename TestFixture::Dense; + using T = typename TestFixture::value_type; + const auto vec = gko::initialize({18., 16., 12.}, this->exec); + auto result = Dense::create_with_config_of(lend(vec)); + const auto l_isai = this->lower_isai_factory->generate(this->l_dense); + + l_isai->apply(lend(vec), lend(result)); + + GKO_ASSERT_MTX_NEAR(result, l({9., -3.5, -24.5}), r::value); +} + + +TYPED_TEST(Isai, ApplyWithUMtx) +{ + using Dense = typename TestFixture::Dense; + using T = typename TestFixture::value_type; + const auto vec = gko::initialize({18., 16., 12.}, this->exec); + auto result = Dense::create_with_config_of(lend(vec)); + const auto u_isai = this->upper_isai_factory->generate(this->u_dense); + + u_isai->apply(lend(vec), lend(result)); + + GKO_ASSERT_MTX_NEAR(result, 
l({6.125, -5., 1.5}), r::value); +} + + +TYPED_TEST(Isai, AdvancedApplyLMtx) +{ + using Dense = typename TestFixture::Dense; + using T = typename TestFixture::value_type; + const auto alpha = gko::initialize({3.}, this->exec); + const auto beta = gko::initialize({-4.}, this->exec); + const auto vec = gko::initialize({18., 16., 12}, this->exec); + auto result = gko::initialize({2., -3., 1.}, this->exec); + const auto l_isai = this->lower_isai_factory->generate(this->l_dense); + + l_isai->apply(lend(alpha), lend(vec), lend(beta), lend(result)); + + GKO_ASSERT_MTX_NEAR(result, l({19., 1.5, -77.5}), r::value); +} + + +TYPED_TEST(Isai, AdvancedApplyUMtx) +{ + using Dense = typename TestFixture::Dense; + using T = typename TestFixture::value_type; + const auto alpha = gko::initialize({3.}, this->exec); + const auto beta = gko::initialize({-4.}, this->exec); + const auto vec = gko::initialize({18., 16., 12}, this->exec); + auto result = gko::initialize({2., -3., 1.}, this->exec); + const auto u_isai = this->upper_isai_factory->generate(this->u_dense); + + u_isai->apply(lend(alpha), lend(vec), lend(beta), lend(result)); + + GKO_ASSERT_MTX_NEAR(result, l({10.375, -3., 0.5}), r::value); +} + + +TYPED_TEST(Isai, UseWithIluPreconditioner) +{ + using Dense = typename TestFixture::Dense; + using index_type = typename TestFixture::index_type; + using T = typename TestFixture::value_type; + using LowerIsai = typename TestFixture::LowerIsai; + using UpperIsai = typename TestFixture::UpperIsai; + const auto vec = gko::initialize({128, -64, 32}, this->exec); + auto result = Dense::create(this->exec, vec->get_size()); + auto mtx = gko::share(Dense::create_with_config_of(lend(this->l_dense))); + this->l_dense->apply(lend(this->u_dense), lend(mtx)); + auto ilu_factory = gko::preconditioner::Ilu::build() + .on(this->exec); + auto ilu = ilu_factory->generate(mtx); + + ilu->apply(lend(vec), lend(result)); + + GKO_ASSERT_MTX_NEAR(result, l({25., -40., -4.}), r::value); +} + + 
+TYPED_TEST(Isai, ReturnsTransposedCorrectInverseL) +{ + using UpperIsai = typename TestFixture::UpperIsai; + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + const auto isai = this->lower_isai_factory->generate(this->l_sparse); + + auto l_inv = gko::as(gko::as(isai->transpose()) + ->get_approximate_inverse() + ->transpose()); + + GKO_ASSERT_MTX_EQ_SPARSITY(l_inv, this->l_sparse_inv); + GKO_ASSERT_MTX_NEAR(l_inv, this->l_sparse_inv, r::value); +} + + +TYPED_TEST(Isai, ReturnsTransposedCorrectInverseU) +{ + using LowerIsai = typename TestFixture::LowerIsai; + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + const auto isai = this->upper_isai_factory->generate(this->u_sparse); + + auto u_inv = gko::as(gko::as(isai->transpose()) + ->get_approximate_inverse() + ->transpose()); + + GKO_ASSERT_MTX_EQ_SPARSITY(u_inv, this->u_sparse_inv); + GKO_ASSERT_MTX_NEAR(u_inv, this->u_sparse_inv, r::value); +} + + +TYPED_TEST(Isai, ReturnsConjTransposedCorrectInverseL) +{ + using UpperIsai = typename TestFixture::UpperIsai; + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + const auto isai = this->lower_isai_factory->generate(this->l_sparse); + + auto l_inv = gko::as(gko::as(isai->conj_transpose()) + ->get_approximate_inverse() + ->conj_transpose()); + + GKO_ASSERT_MTX_EQ_SPARSITY(l_inv, this->l_sparse_inv); + GKO_ASSERT_MTX_NEAR(l_inv, this->l_sparse_inv, r::value); +} + + +TYPED_TEST(Isai, ReturnsConjTransposedCorrectInverseU) +{ + using LowerIsai = typename TestFixture::LowerIsai; + using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + const auto isai = this->upper_isai_factory->generate(this->u_sparse); + + auto u_inv = gko::as(gko::as(isai->conj_transpose()) + ->get_approximate_inverse() + ->conj_transpose()); + + GKO_ASSERT_MTX_EQ_SPARSITY(u_inv, this->u_sparse_inv); + GKO_ASSERT_MTX_NEAR(u_inv, 
this->u_sparse_inv, r::value); +} + + +} // namespace diff --git a/reference/test/preconditioner/jacobi.cpp b/reference/test/preconditioner/jacobi.cpp index 767bcbd5028..29d5ddd477f 100644 --- a/reference/test/preconditioner/jacobi.cpp +++ b/reference/test/preconditioner/jacobi.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -39,27 +39,35 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include -#include #include #include +#include "core/base/extended_float.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/test/utils.hpp" + + namespace { +template class Jacobi : public ::testing::Test { protected: - using Bj = gko::preconditioner::Jacobi<>; - using Mtx = gko::matrix::Csr<>; - using Vec = gko::matrix::Dense<>; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Bj = gko::preconditioner::Jacobi; + using Mtx = gko::matrix::Csr; + using Vec = gko::matrix::Dense; Jacobi() : exec(gko::ReferenceExecutor::create()), bj_factory(Bj::build().with_max_block_size(3u).on(exec)), block_pointers(exec, 3), block_precisions(exec, 2), - mtx(gko::matrix::Csr<>::create(exec, gko::dim<2>{5}, 13)) + mtx(Mtx::create(exec, gko::dim<2>{5}, 13)) { block_pointers.get_data()[0] = 0; block_pointers.get_data()[1] = 2; @@ -74,11 +82,12 @@ class Jacobi : public ::testing::Test { |-1 4 -2 -1 | -1 4 */ - init_array(mtx->get_row_ptrs(), {0, 3, 5, 7, 10, 13}); - init_array(mtx->get_col_idxs(), - {0, 1, 4, 0, 1, 2, 3, 2, 3, 4, 0, 3, 4}); - init_array(mtx->get_values(), {4.0, -2.0, -2.0, -1.0, 4.0, 4.0, -2.0, - -1.0, 4.0, -2.0, -1.0, -1.0, 4.0}); + init_array(mtx->get_row_ptrs(), {0, 3, 5, 7, 10, 13}); + 
init_array(mtx->get_col_idxs(), + {0, 1, 4, 0, 1, 2, 3, 2, 3, 4, 0, 3, 4}); + init_array(mtx->get_values(), + {4.0, -2.0, -2.0, -1.0, 4.0, 4.0, -2.0, -1.0, + 4.0, -2.0, -1.0, -1.0, 4.0}); bj_factory = Bj::build() .with_max_block_size(3u) .with_block_pointers(block_pointers) @@ -109,8 +118,8 @@ class Jacobi : public ::testing::Test { { for (int i = 0; i < block_size; ++i) { for (int j = 0; j < block_size; ++j) { - EXPECT_EQ(static_cast(ptr_a[i * stride_a + j]), - static_cast(ptr_b[i * stride_b + j])) + EXPECT_EQ(static_cast(ptr_a[i * stride_a + j]), + static_cast(ptr_b[i * stride_b + j])) << "Mismatch at position (" << i << ", " << j << ")"; } } @@ -143,7 +152,7 @@ class Jacobi : public ::testing::Test { ASSERT_EQ(prec_a, prec_b); auto scheme = a->get_storage_scheme(); GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( - Bj::value_type, prec_a, + value_type, prec_a, assert_same_block( b_ptr_a[i + 1] - b_ptr_a[i], reinterpret_cast( @@ -158,19 +167,21 @@ class Jacobi : public ::testing::Test { } std::shared_ptr exec; - std::unique_ptr bj_factory; - std::unique_ptr adaptive_bj_factory; - gko::Array block_pointers; + std::unique_ptr bj_factory; + std::unique_ptr adaptive_bj_factory; + gko::Array block_pointers; gko::Array block_precisions; - std::shared_ptr> mtx; + std::shared_ptr mtx; std::unique_ptr bj; std::unique_ptr adaptive_bj; }; +TYPED_TEST_CASE(Jacobi, gko::test::ValueIndexTypes); + -TEST_F(Jacobi, GeneratesCorrectStorageScheme) +TYPED_TEST(Jacobi, GeneratesCorrectStorageScheme) { - auto scheme = bj->get_storage_scheme(); + auto scheme = this->bj->get_storage_scheme(); ASSERT_EQ(scheme.group_power, 3); // 8 3-by-3 blocks fit into 32-wide group ASSERT_EQ(scheme.block_offset, 3); @@ -178,156 +189,185 @@ TEST_F(Jacobi, GeneratesCorrectStorageScheme) } -TEST_F(Jacobi, CanBeCloned) +TYPED_TEST(Jacobi, CanBeCloned) { - auto bj_clone = clone(bj); + auto bj_clone = clone(this->bj); - assert_same_precond(lend(bj_clone), lend(bj)); + this->assert_same_precond(lend(bj_clone), 
lend(this->bj)); } -TEST_F(Jacobi, CanBeClonedWithAdaptvePrecision) +TYPED_TEST(Jacobi, CanBeClonedWithAdaptvePrecision) { - auto bj_clone = clone(adaptive_bj); - assert_same_precond(lend(bj_clone), lend(adaptive_bj)); + auto bj_clone = clone(this->adaptive_bj); + this->assert_same_precond(lend(bj_clone), lend(this->adaptive_bj)); } -TEST_F(Jacobi, CanBeCopied) +TYPED_TEST(Jacobi, CanBeCopied) { - gko::Array empty(exec, 1); + using Bj = typename TestFixture::Bj; + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + gko::Array empty(this->exec, 1); empty.get_data()[0] = 0; - auto copy = Bj::build().with_block_pointers(empty).on(exec)->generate( - Mtx::create(exec)); + auto copy = Bj::build() + .with_block_pointers(empty) + .on(this->exec) + ->generate(Mtx::create(this->exec)); - copy->copy_from(lend(bj)); + copy->copy_from(lend(this->bj)); - assert_same_precond(lend(copy), lend(bj)); + this->assert_same_precond(lend(copy), lend(this->bj)); } -TEST_F(Jacobi, CanBeCopiedWithAdaptivePrecision) +TYPED_TEST(Jacobi, CanBeCopiedWithAdaptivePrecision) { - gko::Array empty(exec, 1); + using Bj = typename TestFixture::Bj; + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + gko::Array empty(this->exec, 1); empty.get_data()[0] = 0; - auto copy = Bj::build().with_block_pointers(empty).on(exec)->generate( - Mtx::create(exec)); + auto copy = Bj::build() + .with_block_pointers(empty) + .on(this->exec) + ->generate(Mtx::create(this->exec)); - copy->copy_from(lend(adaptive_bj)); + copy->copy_from(lend(this->adaptive_bj)); - assert_same_precond(lend(copy), lend(adaptive_bj)); + this->assert_same_precond(lend(copy), lend(this->adaptive_bj)); } -TEST_F(Jacobi, CanBeMoved) +TYPED_TEST(Jacobi, CanBeMoved) { - auto tmp = clone(bj); - gko::Array empty(exec, 1); + using Bj = typename TestFixture::Bj; + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + auto 
tmp = clone(this->bj); + gko::Array empty(this->exec, 1); empty.get_data()[0] = 0; - auto copy = Bj::build().with_block_pointers(empty).on(exec)->generate( - Mtx::create(exec)); + auto copy = Bj::build() + .with_block_pointers(empty) + .on(this->exec) + ->generate(Mtx::create(this->exec)); - copy->copy_from(give(bj)); + copy->copy_from(give(this->bj)); - assert_same_precond(lend(copy), lend(tmp)); + this->assert_same_precond(lend(copy), lend(tmp)); } -TEST_F(Jacobi, CanBeMovedWithAdaptivePrecision) +TYPED_TEST(Jacobi, CanBeMovedWithAdaptivePrecision) { - auto tmp = clone(adaptive_bj); - gko::Array empty(exec, 1); + using Bj = typename TestFixture::Bj; + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + auto tmp = clone(this->adaptive_bj); + gko::Array empty(this->exec, 1); empty.get_data()[0] = 0; - auto copy = Bj::build().with_block_pointers(empty).on(exec)->generate( - Mtx::create(exec)); + auto copy = Bj::build() + .with_block_pointers(empty) + .on(this->exec) + ->generate(Mtx::create(this->exec)); - copy->copy_from(give(adaptive_bj)); + copy->copy_from(give(this->adaptive_bj)); - assert_same_precond(lend(copy), lend(tmp)); + this->assert_same_precond(lend(copy), lend(tmp)); } -TEST_F(Jacobi, CanBeCleared) +TYPED_TEST(Jacobi, CanBeCleared) { - bj->clear(); + this->bj->clear(); - ASSERT_EQ(bj->get_size(), gko::dim<2>(0, 0)); - ASSERT_EQ(bj->get_num_stored_elements(), 0); - ASSERT_EQ(bj->get_parameters().max_block_size, 32); - ASSERT_EQ(bj->get_parameters().block_pointers.get_const_data(), nullptr); - ASSERT_EQ(bj->get_blocks(), nullptr); + ASSERT_EQ(this->bj->get_size(), gko::dim<2>(0, 0)); + ASSERT_EQ(this->bj->get_num_stored_elements(), 0); + ASSERT_EQ(this->bj->get_parameters().max_block_size, 32); + ASSERT_EQ(this->bj->get_parameters().block_pointers.get_const_data(), + nullptr); + ASSERT_EQ(this->bj->get_blocks(), nullptr); } -TEST_F(Jacobi, CanBeClearedWithAdaptivePrecision) +TYPED_TEST(Jacobi, 
CanBeClearedWithAdaptivePrecision) { - adaptive_bj->clear(); - - ASSERT_EQ(adaptive_bj->get_size(), gko::dim<2>(0, 0)); - ASSERT_EQ(adaptive_bj->get_num_stored_elements(), 0); - ASSERT_EQ(adaptive_bj->get_parameters().max_block_size, 32); - ASSERT_EQ(adaptive_bj->get_parameters().block_pointers.get_const_data(), - nullptr); - ASSERT_EQ(adaptive_bj->get_parameters() + this->adaptive_bj->clear(); + + ASSERT_EQ(this->adaptive_bj->get_size(), gko::dim<2>(0, 0)); + ASSERT_EQ(this->adaptive_bj->get_num_stored_elements(), 0); + ASSERT_EQ(this->adaptive_bj->get_parameters().max_block_size, 32); + ASSERT_EQ( + this->adaptive_bj->get_parameters().block_pointers.get_const_data(), + nullptr); + ASSERT_EQ(this->adaptive_bj->get_parameters() .storage_optimization.block_wise.get_const_data(), nullptr); - ASSERT_EQ(adaptive_bj->get_blocks(), nullptr); + ASSERT_EQ(this->adaptive_bj->get_blocks(), nullptr); } #define GKO_EXPECT_NONZERO_NEAR(first, second, tol) \ EXPECT_EQ(first.row, second.row); \ EXPECT_EQ(first.column, second.column); \ - EXPECT_NEAR(first.value, second.value, tol) + GKO_EXPECT_NEAR(first.value, second.value, tol) -TEST_F(Jacobi, GeneratesCorrectMatrixData) +TYPED_TEST(Jacobi, GeneratesCorrectMatrixData) { - using tpl = gko::matrix_data<>::nonzero_type; - gko::matrix_data<> data; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using tpl = typename gko::matrix_data::nonzero_type; + auto tol = r::value; + gko::matrix_data data; - bj->write(data); + this->bj->write(data); ASSERT_EQ(data.size, gko::dim<2>{5}); ASSERT_EQ(data.nonzeros.size(), 13); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[0], tpl(0, 0, 4.0 / 14), 1e-14); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[1], tpl(0, 1, 2.0 / 14), 1e-14); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[2], tpl(1, 0, 1.0 / 14), 1e-14); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[3], tpl(1, 1, 4.0 / 14), 1e-14); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[4], tpl(2, 2, 14.0 / 48), 1e-14); 
- GKO_EXPECT_NONZERO_NEAR(data.nonzeros[5], tpl(2, 3, 8.0 / 48), 1e-14); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[6], tpl(2, 4, 4.0 / 48), 1e-14); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[7], tpl(3, 2, 4.0 / 48), 1e-14); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[8], tpl(3, 3, 16.0 / 48), 1e-14); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[9], tpl(3, 4, 8.0 / 48), 1e-14); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[10], tpl(4, 2, 1.0 / 48), 1e-14); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[11], tpl(4, 3, 4.0 / 48), 1e-14); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[12], tpl(4, 4, 14.0 / 48), 1e-14); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[0], tpl(0, 0, 4.0 / 14), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[1], tpl(0, 1, 2.0 / 14), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[2], tpl(1, 0, 1.0 / 14), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[3], tpl(1, 1, 4.0 / 14), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[4], tpl(2, 2, 14.0 / 48), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[5], tpl(2, 3, 8.0 / 48), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[6], tpl(2, 4, 4.0 / 48), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[7], tpl(3, 2, 4.0 / 48), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[8], tpl(3, 3, 16.0 / 48), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[9], tpl(3, 4, 8.0 / 48), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[10], tpl(4, 2, 1.0 / 48), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[11], tpl(4, 3, 4.0 / 48), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[12], tpl(4, 4, 14.0 / 48), tol); } -TEST_F(Jacobi, GeneratesCorrectMatrixDataWithAdaptivePrecision) +TYPED_TEST(Jacobi, GeneratesCorrectMatrixDataWithAdaptivePrecision) { - using tpl = gko::matrix_data<>::nonzero_type; - gko::matrix_data<> data; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using tpl = typename gko::matrix_data::nonzero_type; + gko::matrix_data data; + auto tol = r::value; + auto half_tol = std::sqrt(r::value); - 
adaptive_bj->write(data); + this->adaptive_bj->write(data); ASSERT_EQ(data.size, gko::dim<2>{5}); ASSERT_EQ(data.nonzeros.size(), 13); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[0], tpl(0, 0, 4.0 / 14), 1e-7); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[1], tpl(0, 1, 2.0 / 14), 1e-7); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[2], tpl(1, 0, 1.0 / 14), 1e-7); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[3], tpl(1, 1, 4.0 / 14), 1e-7); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[4], tpl(2, 2, 14.0 / 48), 1e-14); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[5], tpl(2, 3, 8.0 / 48), 1e-14); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[6], tpl(2, 4, 4.0 / 48), 1e-14); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[7], tpl(3, 2, 4.0 / 48), 1e-14); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[8], tpl(3, 3, 16.0 / 48), 1e-14); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[9], tpl(3, 4, 8.0 / 48), 1e-14); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[10], tpl(4, 2, 1.0 / 48), 1e-14); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[11], tpl(4, 3, 4.0 / 48), 1e-14); - GKO_EXPECT_NONZERO_NEAR(data.nonzeros[12], tpl(4, 4, 14.0 / 48), 1e-14); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[0], tpl(0, 0, 4.0 / 14), half_tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[1], tpl(0, 1, 2.0 / 14), half_tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[2], tpl(1, 0, 1.0 / 14), half_tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[3], tpl(1, 1, 4.0 / 14), half_tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[4], tpl(2, 2, 14.0 / 48), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[5], tpl(2, 3, 8.0 / 48), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[6], tpl(2, 4, 4.0 / 48), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[7], tpl(3, 2, 4.0 / 48), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[8], tpl(3, 3, 16.0 / 48), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[9], tpl(3, 4, 8.0 / 48), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[10], tpl(4, 2, 1.0 / 48), tol); + GKO_EXPECT_NONZERO_NEAR(data.nonzeros[11], tpl(4, 3, 4.0 / 48), tol); + 
GKO_EXPECT_NONZERO_NEAR(data.nonzeros[12], tpl(4, 4, 14.0 / 48), tol); } diff --git a/reference/test/preconditioner/jacobi_kernels.cpp b/reference/test/preconditioner/jacobi_kernels.cpp index be1cf1178f7..9fc4a028f27 100644 --- a/reference/test/preconditioner/jacobi_kernels.cpp +++ b/reference/test/preconditioner/jacobi_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,31 +34,41 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include -#include #include #include +#include "core/base/extended_float.hpp" +#include "core/test/utils.hpp" + + namespace { +template class Jacobi : public ::testing::Test { protected: - using Bj = gko::preconditioner::Jacobi<>; - using Mtx = gko::matrix::Csr<>; - using Vec = gko::matrix::Dense<>; - using mdata = gko::matrix_data<>; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Bj = gko::preconditioner::Jacobi; + using Mtx = gko::matrix::Csr; + using Vec = gko::matrix::Dense; + using mdata = gko::matrix_data; Jacobi() : exec(gko::ReferenceExecutor::create()), block_pointers(exec, 3), block_precisions(exec, 2), - mtx(gko::matrix::Csr<>::create(exec, gko::dim<2>{5}, 13)) + mtx(gko::matrix::Csr::create( + exec, gko::dim<2>{5}, 13)) { block_pointers.get_data()[0] = 0; block_pointers.get_data()[1] = 2; @@ -83,11 +93,12 @@ class Jacobi : public ::testing::Test { |-1 4 -2 -1 | -1 4 */ - init_array(mtx->get_row_ptrs(), {0, 3, 5, 7, 10, 13}); - init_array(mtx->get_col_idxs(), - {0, 1, 4, 0, 1, 2, 3, 2, 3, 4, 0, 3, 4}); - init_array(mtx->get_values(), {4.0, -2.0, -2.0, -1.0, 4.0, 4.0, -2.0, - -1.0, 4.0, -2.0, -1.0, -1.0, 4.0}); + 
init_array(mtx->get_row_ptrs(), {0, 3, 5, 7, 10, 13}); + init_array(mtx->get_col_idxs(), + {0, 1, 4, 0, 1, 2, 3, 2, 3, 4, 0, 3, 4}); + init_array(mtx->get_values(), + {4.0, -2.0, -2.0, -1.0, 4.0, 4.0, -2.0, -1.0, + 4.0, -2.0, -1.0, -1.0, 4.0}); } template @@ -99,20 +110,22 @@ class Jacobi : public ::testing::Test { } std::shared_ptr exec; - std::unique_ptr bj_factory; - std::unique_ptr adaptive_bj_factory; - gko::Array block_pointers; + std::unique_ptr bj_factory; + std::unique_ptr adaptive_bj_factory; + gko::Array block_pointers; gko::Array block_precisions; - std::shared_ptr> mtx; + std::shared_ptr> mtx; }; +TYPED_TEST_CASE(Jacobi, gko::test::ValueIndexTypes); -TEST_F(Jacobi, CanBeGenerated) + +TYPED_TEST(Jacobi, CanBeGenerated) { - auto bj = bj_factory->generate(mtx); + auto bj = this->bj_factory->generate(this->mtx); ASSERT_NE(bj, nullptr); - EXPECT_EQ(bj->get_executor(), exec); + EXPECT_EQ(bj->get_executor(), this->exec); EXPECT_EQ(bj->get_parameters().max_block_size, 3); ASSERT_EQ(bj->get_size(), gko::dim<2>(5, 5)); ASSERT_EQ(bj->get_num_blocks(), 2); @@ -123,11 +136,11 @@ TEST_F(Jacobi, CanBeGenerated) } -TEST_F(Jacobi, CanBeGeneratedWithAdaptivePrecision) +TYPED_TEST(Jacobi, CanBeGeneratedWithAdaptivePrecision) { - auto bj = adaptive_bj_factory->generate(mtx); + auto bj = this->adaptive_bj_factory->generate(this->mtx); - EXPECT_EQ(bj->get_executor(), exec); + EXPECT_EQ(bj->get_executor(), this->exec); EXPECT_EQ(bj->get_parameters().max_block_size, 17); ASSERT_EQ(bj->get_size(), gko::dim<2>(5, 5)); ASSERT_EQ(bj->get_num_blocks(), 2); @@ -142,7 +155,7 @@ TEST_F(Jacobi, CanBeGeneratedWithAdaptivePrecision) } -TEST_F(Jacobi, FindsNaturalBlocks) +TYPED_TEST(Jacobi, FindsNaturalBlocks) { /* example matrix: 1 1 @@ -150,11 +163,18 @@ TEST_F(Jacobi, FindsNaturalBlocks) 1 1 1 1 */ - auto mtx = Mtx::create(exec, gko::dim<2>{4}, 8); - init_array(mtx->get_row_ptrs(), {0, 2, 4, 6, 8}); - init_array(mtx->get_col_idxs(), {0, 1, 0, 1, 0, 2, 0, 2}); - 
init_array(mtx->get_values(), {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); - auto bj = Bj::build().with_max_block_size(3u).on(exec)->generate(give(mtx)); + using Bj = typename TestFixture::Bj; + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + using value_type = typename TestFixture::value_type; + auto mtx = Mtx::create(this->exec, gko::dim<2>{4}, 8); + this->template init_array(mtx->get_row_ptrs(), {0, 2, 4, 6, 8}); + this->template init_array(mtx->get_col_idxs(), + {0, 1, 0, 1, 0, 2, 0, 2}); + this->template init_array( + mtx->get_values(), {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); + auto bj = + Bj::build().with_max_block_size(3u).on(this->exec)->generate(give(mtx)); EXPECT_EQ(bj->get_parameters().max_block_size, 3); ASSERT_EQ(bj->get_num_blocks(), 2); @@ -165,7 +185,7 @@ TEST_F(Jacobi, FindsNaturalBlocks) } -TEST_F(Jacobi, ExecutesSupervariableAgglomeration) +TYPED_TEST(Jacobi, ExecutesSupervariableAgglomeration) { /* example matrix: 1 1 @@ -174,12 +194,19 @@ TEST_F(Jacobi, ExecutesSupervariableAgglomeration) 1 1 1 */ - auto mtx = Mtx::create(exec, gko::dim<2>{5}, 9); - init_array(mtx->get_row_ptrs(), {0, 2, 4, 6, 8, 9}); - init_array(mtx->get_col_idxs(), {0, 1, 0, 1, 2, 3, 2, 3, 4}); - init_array(mtx->get_values(), - {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); - auto bj = Bj::build().with_max_block_size(3u).on(exec)->generate(give(mtx)); + using Bj = typename TestFixture::Bj; + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + using value_type = typename TestFixture::value_type; + auto mtx = Mtx::create(this->exec, gko::dim<2>{5}, 9); + this->template init_array(mtx->get_row_ptrs(), + {0, 2, 4, 6, 8, 9}); + this->template init_array(mtx->get_col_idxs(), + {0, 1, 0, 1, 2, 3, 2, 3, 4}); + this->template init_array( + mtx->get_values(), {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); + auto bj = + Bj::build().with_max_block_size(3u).on(this->exec)->generate(give(mtx)); 
EXPECT_EQ(bj->get_parameters().max_block_size, 3); ASSERT_EQ(bj->get_num_blocks(), 2); @@ -190,7 +217,7 @@ TEST_F(Jacobi, ExecutesSupervariableAgglomeration) } -TEST_F(Jacobi, AdheresToBlockSizeBound) +TYPED_TEST(Jacobi, AdheresToBlockSizeBound) { /* example matrix: 1 @@ -201,11 +228,19 @@ TEST_F(Jacobi, AdheresToBlockSizeBound) 1 1 */ - auto mtx = Mtx::create(exec, gko::dim<2>{7}, 7); - init_array(mtx->get_row_ptrs(), {0, 1, 2, 3, 4, 5, 6, 7}); - init_array(mtx->get_col_idxs(), {0, 1, 2, 3, 4, 5, 6}); - init_array(mtx->get_values(), {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); - auto bj = Bj::build().with_max_block_size(3u).on(exec)->generate(give(mtx)); + using Bj = typename TestFixture::Bj; + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + using value_type = typename TestFixture::value_type; + auto mtx = Mtx::create(this->exec, gko::dim<2>{7}, 7); + this->template init_array(mtx->get_row_ptrs(), + {0, 1, 2, 3, 4, 5, 6, 7}); + this->template init_array(mtx->get_col_idxs(), + {0, 1, 2, 3, 4, 5, 6}); + this->template init_array(mtx->get_values(), + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); + auto bj = + Bj::build().with_max_block_size(3u).on(this->exec)->generate(give(mtx)); EXPECT_EQ(bj->get_parameters().max_block_size, 3); ASSERT_EQ(bj->get_num_blocks(), 3); @@ -217,12 +252,14 @@ TEST_F(Jacobi, AdheresToBlockSizeBound) } -TEST_F(Jacobi, CanBeGeneratedWithUnknownBlockSizes) +TYPED_TEST(Jacobi, CanBeGeneratedWithUnknownBlockSizes) { - auto bj = Bj::build().with_max_block_size(3u).on(exec)->generate(mtx); + using Bj = typename TestFixture::Bj; + auto bj = + Bj::build().with_max_block_size(3u).on(this->exec)->generate(this->mtx); ASSERT_NE(bj, nullptr); - EXPECT_EQ(bj->get_executor(), exec); + EXPECT_EQ(bj->get_executor(), this->exec); EXPECT_EQ(bj->get_parameters().max_block_size, 3); ASSERT_EQ(bj->get_size(), gko::dim<2>(5, 5)); ASSERT_EQ(bj->get_num_blocks(), 2); @@ -233,397 +270,611 @@ TEST_F(Jacobi, 
CanBeGeneratedWithUnknownBlockSizes) } -TEST_F(Jacobi, InvertsDiagonalBlocks) +TYPED_TEST(Jacobi, InvertsDiagonalBlocks) { - auto bj = bj_factory->generate(mtx); + using T = typename TestFixture::value_type; + auto bj = this->bj_factory->generate(this->mtx); auto scheme = bj->get_storage_scheme(); auto p = scheme.get_stride(); auto b1 = bj->get_blocks() + scheme.get_global_block_offset(0); - EXPECT_NEAR(b1[0 + 0 * p], 4.0 / 14.0, 1e-14); - EXPECT_NEAR(b1[0 + 1 * p], 2.0 / 14.0, 1e-14); - EXPECT_NEAR(b1[1 + 0 * p], 1.0 / 14.0, 1e-14); - EXPECT_NEAR(b1[1 + 1 * p], 4.0 / 14.0, 1e-14); + GKO_EXPECT_NEAR(b1[0 + 0 * p], T{4.0 / 14.0}, r::value); + GKO_EXPECT_NEAR(b1[0 + 1 * p], T{2.0 / 14.0}, r::value); + GKO_EXPECT_NEAR(b1[1 + 0 * p], T{1.0 / 14.0}, r::value); + GKO_EXPECT_NEAR(b1[1 + 1 * p], T{4.0 / 14.0}, r::value); + + auto b2 = bj->get_blocks() + scheme.get_global_block_offset(1); + GKO_EXPECT_NEAR(b2[0 + 0 * p], T{14.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[0 + 1 * p], T{8.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[0 + 2 * p], T{4.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[1 + 0 * p], T{4.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[1 + 1 * p], T{16.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[1 + 2 * p], T{8.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[2 + 0 * p], T{1.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[2 + 1 * p], T{4.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[2 + 2 * p], T{14.0 / 48.0}, r::value); +} + + +TYPED_TEST(Jacobi, InvertsDiagonalBlocksWithAdaptivePrecision) +{ + using T = typename TestFixture::value_type; + auto half_tol = std::sqrt(r::value); + auto bj = this->adaptive_bj_factory->generate(this->mtx); + + auto scheme = bj->get_storage_scheme(); + auto p = scheme.get_stride(); + const auto b_prec_bj = + bj->get_parameters().storage_optimization.block_wise.get_const_data(); + using reduced = ::gko::reduce_precision; + auto b1 = reinterpret_cast( + bj->get_blocks() + scheme.get_global_block_offset(0)); + GKO_EXPECT_NEAR(b1[0 + 0 * p], reduced{4.0 / 
14.0}, half_tol); + GKO_EXPECT_NEAR(b1[0 + 1 * p], reduced{2.0 / 14.0}, half_tol); + GKO_EXPECT_NEAR(b1[1 + 0 * p], reduced{1.0 / 14.0}, half_tol); + GKO_EXPECT_NEAR(b1[1 + 1 * p], reduced{4.0 / 14.0}, half_tol); + + auto b2 = bj->get_blocks() + scheme.get_global_block_offset(1); + GKO_EXPECT_NEAR(b2[0 + 0 * p], T{14.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[0 + 1 * p], T{8.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[0 + 2 * p], T{4.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[1 + 0 * p], T{4.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[1 + 1 * p], T{16.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[1 + 2 * p], T{8.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[2 + 0 * p], T{1.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[2 + 1 * p], T{4.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[2 + 2 * p], T{14.0 / 48.0}, r::value); +} + +TYPED_TEST(Jacobi, CanTransposeDiagonalBlocks) +{ + using T = typename TestFixture::value_type; + using Bj = typename TestFixture::Bj; + auto tmp_bj = this->bj_factory->generate(this->mtx); + + auto bj = gko::as(tmp_bj->transpose()); + + auto scheme = bj->get_storage_scheme(); + auto p = scheme.get_stride(); + auto b1 = bj->get_blocks() + scheme.get_global_block_offset(0); + GKO_EXPECT_NEAR(b1[0 + 0 * p], T{4.0 / 14.0}, r::value); + GKO_EXPECT_NEAR(b1[1 + 0 * p], T{2.0 / 14.0}, r::value); + GKO_EXPECT_NEAR(b1[0 + 1 * p], T{1.0 / 14.0}, r::value); + GKO_EXPECT_NEAR(b1[1 + 1 * p], T{4.0 / 14.0}, r::value); auto b2 = bj->get_blocks() + scheme.get_global_block_offset(1); - EXPECT_NEAR(b2[0 + 0 * p], 14.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[0 + 1 * p], 8.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[0 + 2 * p], 4.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[1 + 0 * p], 4.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[1 + 1 * p], 16.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[1 + 2 * p], 8.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[2 + 0 * p], 1.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[2 + 1 * p], 4.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[2 + 2 * p], 14.0 / 48.0, 1e-14); + GKO_EXPECT_NEAR(b2[0 + 0 * p], T{14.0 / 48.0}, 
r::value); + GKO_EXPECT_NEAR(b2[1 + 0 * p], T{8.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[2 + 0 * p], T{4.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[0 + 1 * p], T{4.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[1 + 1 * p], T{16.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[2 + 1 * p], T{8.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[0 + 2 * p], T{1.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[1 + 2 * p], T{4.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[2 + 2 * p], T{14.0 / 48.0}, r::value); } -TEST_F(Jacobi, InvertsDiagonalBlocksWithAdaptivePrecision) + +TYPED_TEST(Jacobi, CanTransposeDiagonalBlocksWithAdaptivePrecision) { - auto bj = adaptive_bj_factory->generate(mtx); + using T = typename TestFixture::value_type; + using Bj = typename TestFixture::Bj; + auto half_tol = std::sqrt(r::value); + auto tmp_bj = this->adaptive_bj_factory->generate(this->mtx); + + auto bj = gko::as(tmp_bj->transpose()); auto scheme = bj->get_storage_scheme(); auto p = scheme.get_stride(); - auto b1 = reinterpret_cast( + using reduced = ::gko::reduce_precision; + auto b1 = reinterpret_cast( bj->get_blocks() + scheme.get_global_block_offset(0)); - EXPECT_NEAR(b1[0 + 0 * p], 4.0 / 14.0, 1e-7); - EXPECT_NEAR(b1[0 + 1 * p], 2.0 / 14.0, 1e-7); - EXPECT_NEAR(b1[1 + 0 * p], 1.0 / 14.0, 1e-7); - EXPECT_NEAR(b1[1 + 1 * p], 4.0 / 14.0, 1e-7); + GKO_EXPECT_NEAR(b1[0 + 0 * p], reduced{4.0 / 14.0}, half_tol); + GKO_EXPECT_NEAR(b1[1 + 0 * p], reduced{2.0 / 14.0}, half_tol); + GKO_EXPECT_NEAR(b1[0 + 1 * p], reduced{1.0 / 14.0}, half_tol); + GKO_EXPECT_NEAR(b1[1 + 1 * p], reduced{4.0 / 14.0}, half_tol); + auto b2 = bj->get_blocks() + scheme.get_global_block_offset(1); + GKO_EXPECT_NEAR(b2[0 + 0 * p], T{14.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[1 + 0 * p], T{8.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[2 + 0 * p], T{4.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[0 + 1 * p], T{4.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[1 + 1 * p], T{16.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[2 + 1 * p], T{8.0 / 48.0}, 
r::value); + GKO_EXPECT_NEAR(b2[0 + 2 * p], T{1.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[1 + 2 * p], T{4.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[2 + 2 * p], T{14.0 / 48.0}, r::value); +} + + +TYPED_TEST(Jacobi, CanConjTransposeDiagonalBlocks) +{ + using T = typename TestFixture::value_type; + using Bj = typename TestFixture::Bj; + auto tmp_bj = this->bj_factory->generate(this->mtx); + + auto bj = gko::as(tmp_bj->conj_transpose()); + + auto scheme = bj->get_storage_scheme(); + auto p = scheme.get_stride(); + auto b1 = bj->get_blocks() + scheme.get_global_block_offset(0); + GKO_EXPECT_NEAR(b1[0 + 0 * p], T{4.0 / 14.0}, r::value); + GKO_EXPECT_NEAR(b1[1 + 0 * p], T{2.0 / 14.0}, r::value); + GKO_EXPECT_NEAR(b1[0 + 1 * p], T{1.0 / 14.0}, r::value); + GKO_EXPECT_NEAR(b1[1 + 1 * p], T{4.0 / 14.0}, r::value); + auto b2 = bj->get_blocks() + scheme.get_global_block_offset(1); + GKO_EXPECT_NEAR(b2[0 + 0 * p], T{14.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[1 + 0 * p], T{8.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[2 + 0 * p], T{4.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[0 + 1 * p], T{4.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[1 + 1 * p], T{16.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[2 + 1 * p], T{8.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[0 + 2 * p], T{1.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[1 + 2 * p], T{4.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[2 + 2 * p], T{14.0 / 48.0}, r::value); +} + +TYPED_TEST(Jacobi, CanConjTransposeDiagonalBlocksWithAdaptivePrecision) +{ + using T = typename TestFixture::value_type; + using Bj = typename TestFixture::Bj; + auto half_tol = std::sqrt(r::value); + auto tmp_bj = this->adaptive_bj_factory->generate(this->mtx); + + auto bj = gko::as(tmp_bj->conj_transpose()); + + auto scheme = bj->get_storage_scheme(); + auto p = scheme.get_stride(); + using reduced = ::gko::reduce_precision; + auto b1 = reinterpret_cast( + bj->get_blocks() + scheme.get_global_block_offset(0)); + GKO_EXPECT_NEAR(b1[0 + 0 * p], reduced{4.0 / 14.0}, 
half_tol); + GKO_EXPECT_NEAR(b1[1 + 0 * p], reduced{2.0 / 14.0}, half_tol); + GKO_EXPECT_NEAR(b1[0 + 1 * p], reduced{1.0 / 14.0}, half_tol); + GKO_EXPECT_NEAR(b1[1 + 1 * p], reduced{4.0 / 14.0}, half_tol); auto b2 = bj->get_blocks() + scheme.get_global_block_offset(1); - EXPECT_NEAR(b2[0 + 0 * p], 14.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[0 + 1 * p], 8.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[0 + 2 * p], 4.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[1 + 0 * p], 4.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[1 + 1 * p], 16.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[1 + 2 * p], 8.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[2 + 0 * p], 1.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[2 + 1 * p], 4.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[2 + 2 * p], 14.0 / 48.0, 1e-14); + GKO_EXPECT_NEAR(b2[0 + 0 * p], T{14.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[1 + 0 * p], T{8.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[2 + 0 * p], T{4.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[0 + 1 * p], T{4.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[1 + 1 * p], T{16.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[2 + 1 * p], T{8.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[0 + 2 * p], T{1.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[1 + 2 * p], T{4.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[2 + 2 * p], T{14.0 / 48.0}, r::value); } -TEST_F(Jacobi, InvertsDiagonalBlocksWithAdaptivePrecisionAndSmallBlocks) +TYPED_TEST(Jacobi, InvertsDiagonalBlocksWithAdaptivePrecisionAndSmallBlocks) { + using Bj = typename TestFixture::Bj; + using T = typename TestFixture::value_type; auto bj = Bj::build() .with_max_block_size(3u) // group size will be > 1 - .with_block_pointers(block_pointers) - .with_storage_optimization(block_precisions) - .on(exec) - ->generate(mtx); + .with_block_pointers(this->block_pointers) + .with_storage_optimization(this->block_precisions) + .on(this->exec) + ->generate(this->mtx); auto scheme = bj->get_storage_scheme(); auto p = scheme.get_stride(); auto b1 = bj->get_blocks() + scheme.get_global_block_offset(0); - EXPECT_NEAR(b1[0 + 0 * p], 4.0 / 14.0, 
1e-14); - EXPECT_NEAR(b1[0 + 1 * p], 2.0 / 14.0, 1e-14); - EXPECT_NEAR(b1[1 + 0 * p], 1.0 / 14.0, 1e-14); - EXPECT_NEAR(b1[1 + 1 * p], 4.0 / 14.0, 1e-14); + GKO_EXPECT_NEAR(b1[0 + 0 * p], T{4.0 / 14.0}, r::value); + GKO_EXPECT_NEAR(b1[0 + 1 * p], T{2.0 / 14.0}, r::value); + GKO_EXPECT_NEAR(b1[1 + 0 * p], T{1.0 / 14.0}, r::value); + GKO_EXPECT_NEAR(b1[1 + 1 * p], T{4.0 / 14.0}, r::value); auto b2 = bj->get_blocks() + scheme.get_global_block_offset(1); - EXPECT_NEAR(b2[0 + 0 * p], 14.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[0 + 1 * p], 8.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[0 + 2 * p], 4.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[1 + 0 * p], 4.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[1 + 1 * p], 16.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[1 + 2 * p], 8.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[2 + 0 * p], 1.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[2 + 1 * p], 4.0 / 48.0, 1e-14); - EXPECT_NEAR(b2[2 + 2 * p], 14.0 / 48.0, 1e-14); + GKO_EXPECT_NEAR(b2[0 + 0 * p], T{14.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[0 + 1 * p], T{8.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[0 + 2 * p], T{4.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[1 + 0 * p], T{4.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[1 + 1 * p], T{16.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[1 + 2 * p], T{8.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[2 + 0 * p], T{1.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[2 + 1 * p], T{4.0 / 48.0}, r::value); + GKO_EXPECT_NEAR(b2[2 + 2 * p], T{14.0 / 48.0}, r::value); } -TEST_F(Jacobi, PivotsWhenInvertingBlocks) +TYPED_TEST(Jacobi, PivotsWhenInvertingBlocks) { - gko::Array bp(exec, 2); - init_array(bp.get_data(), {0, 3}); - auto mtx = Mtx::create(exec, gko::dim<2>{3}, 9); + using Bj = typename TestFixture::Bj; + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + gko::Array bp(this->exec, 2); + this->template init_array(bp.get_data(), {0, 3}); + auto mtx = Mtx::create(this->exec, gko::dim<2>{3}, 9); /* test matrix: 0 2 0 0 0 4 1 0 0 */ 
- init_array(mtx->get_row_ptrs(), {0, 3, 6, 9}); - init_array(mtx->get_col_idxs(), {0, 1, 2, 0, 1, 2, 0, 1, 2}); - init_array(mtx->get_values(), - {0.0, 2.0, 0.0, 0.0, 0.0, 4.0, 1.0, 0.0, 0.0}); + this->template init_array(mtx->get_row_ptrs(), {0, 3, 6, 9}); + this->template init_array(mtx->get_col_idxs(), + {0, 1, 2, 0, 1, 2, 0, 1, 2}); + this->template init_array(mtx->get_values(), + {0.0, 2.0, 0.0, 0.0, 0.0, 4.0, 1.0, 0.0, 0.0}); auto bj = Bj::build() .with_max_block_size(3u) .with_block_pointers(bp) - .on(exec) + .on(this->exec) ->generate(give(mtx)); auto scheme = bj->get_storage_scheme(); auto p = scheme.get_stride(); auto b1 = bj->get_blocks() + scheme.get_global_block_offset(0); - EXPECT_NEAR(b1[0 + 0 * p], 0.0 / 4.0, 1e-14); - EXPECT_NEAR(b1[0 + 1 * p], 0.0 / 4.0, 1e-14); - EXPECT_NEAR(b1[0 + 2 * p], 4.0 / 4.0, 1e-14); - EXPECT_NEAR(b1[1 + 0 * p], 2.0 / 4.0, 1e-14); - EXPECT_NEAR(b1[1 + 1 * p], 0.0 / 4.0, 1e-14); - EXPECT_NEAR(b1[1 + 2 * p], 0.0 / 4.0, 1e-14); - EXPECT_NEAR(b1[2 + 0 * p], 0.0 / 4.0, 1e-14); - EXPECT_NEAR(b1[2 + 1 * p], 1.0 / 4.0, 1e-14); - EXPECT_NEAR(b1[2 + 2 * p], 0.0 / 4.0, 1e-14); + GKO_EXPECT_NEAR(b1[0 + 0 * p], T{0.0 / 4.0}, r::value); + GKO_EXPECT_NEAR(b1[0 + 1 * p], T{0.0 / 4.0}, r::value); + GKO_EXPECT_NEAR(b1[0 + 2 * p], T{4.0 / 4.0}, r::value); + GKO_EXPECT_NEAR(b1[1 + 0 * p], T{2.0 / 4.0}, r::value); + GKO_EXPECT_NEAR(b1[1 + 1 * p], T{0.0 / 4.0}, r::value); + GKO_EXPECT_NEAR(b1[1 + 2 * p], T{0.0 / 4.0}, r::value); + GKO_EXPECT_NEAR(b1[2 + 0 * p], T{0.0 / 4.0}, r::value); + GKO_EXPECT_NEAR(b1[2 + 1 * p], T{1.0 / 4.0}, r::value); + GKO_EXPECT_NEAR(b1[2 + 2 * p], T{0.0 / 4.0}, r::value); } -TEST_F(Jacobi, PivotsWhenInvertingBlocksWithiAdaptivePrecision) +TYPED_TEST(Jacobi, PivotsWhenInvertingBlocksWithiAdaptivePrecision) { - gko::Array bp(exec, 2); - init_array(bp.get_data(), {0, 3}); - auto mtx = Mtx::create(exec, gko::dim<2>{3}, 9); + using Bj = typename TestFixture::Bj; + using Mtx = typename TestFixture::Mtx; + using 
index_type = typename TestFixture::index_type; + using T = typename TestFixture::value_type; + auto half_tol = std::sqrt(r::value); + gko::Array bp(this->exec, 2); + this->template init_array(bp.get_data(), {0, 3}); + auto mtx = Mtx::create(this->exec, gko::dim<2>{3}, 9); /* test matrix: 0 2 0 0 0 4 1 0 0 */ - init_array(mtx->get_row_ptrs(), {0, 3, 6, 9}); - init_array(mtx->get_col_idxs(), {0, 1, 2, 0, 1, 2, 0, 1, 2}); - init_array(mtx->get_values(), - {0.0, 2.0, 0.0, 0.0, 0.0, 4.0, 1.0, 0.0, 0.0}); + this->template init_array(mtx->get_row_ptrs(), {0, 3, 6, 9}); + this->template init_array(mtx->get_col_idxs(), + {0, 1, 2, 0, 1, 2, 0, 1, 2}); + this->template init_array(mtx->get_values(), + {0.0, 2.0, 0.0, 0.0, 0.0, 4.0, 1.0, 0.0, 0.0}); auto bj = Bj::build() .with_max_block_size(3u) .with_block_pointers(bp) - .with_storage_optimization(block_precisions) - .on(exec) + .with_storage_optimization(this->block_precisions) + .on(this->exec) ->generate(give(mtx)); auto scheme = bj->get_storage_scheme(); auto p = scheme.get_stride(); - auto b1 = reinterpret_cast( + using reduced = ::gko::reduce_precision; + auto b1 = reinterpret_cast( bj->get_blocks() + scheme.get_global_block_offset(0)); - EXPECT_NEAR(b1[0 + 0 * p], 0.0 / 4.0, 1e-7); - EXPECT_NEAR(b1[0 + 1 * p], 0.0 / 4.0, 1e-7); - EXPECT_NEAR(b1[0 + 2 * p], 4.0 / 4.0, 1e-7); - EXPECT_NEAR(b1[1 + 0 * p], 2.0 / 4.0, 1e-7); - EXPECT_NEAR(b1[1 + 1 * p], 0.0 / 4.0, 1e-7); - EXPECT_NEAR(b1[1 + 2 * p], 0.0 / 4.0, 1e-7); - EXPECT_NEAR(b1[2 + 0 * p], 0.0 / 4.0, 1e-7); - EXPECT_NEAR(b1[2 + 1 * p], 1.0 / 4.0, 1e-7); - EXPECT_NEAR(b1[2 + 2 * p], 0.0 / 4.0, 1e-7); + GKO_EXPECT_NEAR(b1[0 + 0 * p], reduced{0.0 / 4.0}, half_tol); + GKO_EXPECT_NEAR(b1[0 + 1 * p], reduced{0.0 / 4.0}, half_tol); + GKO_EXPECT_NEAR(b1[0 + 2 * p], reduced{4.0 / 4.0}, half_tol); + GKO_EXPECT_NEAR(b1[1 + 0 * p], reduced{2.0 / 4.0}, half_tol); + GKO_EXPECT_NEAR(b1[1 + 1 * p], reduced{0.0 / 4.0}, half_tol); + GKO_EXPECT_NEAR(b1[1 + 2 * p], reduced{0.0 / 4.0}, 
half_tol); + GKO_EXPECT_NEAR(b1[2 + 0 * p], reduced{0.0 / 4.0}, half_tol); + GKO_EXPECT_NEAR(b1[2 + 1 * p], reduced{1.0 / 4.0}, half_tol); + GKO_EXPECT_NEAR(b1[2 + 2 * p], reduced{0.0 / 4.0}, half_tol); } -TEST_F(Jacobi, ComputesConditionNumbersOfBlocks) +TYPED_TEST(Jacobi, ComputesConditionNumbersOfBlocks) { - auto bj = adaptive_bj_factory->generate(mtx); + using T = typename TestFixture::value_type; + auto bj = this->adaptive_bj_factory->generate(this->mtx); auto cond = bj->get_conditioning(); - EXPECT_NEAR(cond[0], 6.0 * 6.0 / 14.0, 1e-14); - ASSERT_NEAR(cond[1], 7.0 * 28.0 / 48.0, 1e-14); + GKO_EXPECT_NEAR(cond[0], gko::remove_complex{6.0 * 6.0 / 14.0}, + r::value * 1e1); + GKO_ASSERT_NEAR(cond[1], gko::remove_complex{7.0 * 28.0 / 48.0}, + r::value * 1e1); } -TEST_F(Jacobi, SelectsCorrectBlockPrecisions) +TYPED_TEST(Jacobi, SelectsCorrectBlockPrecisions) { + using Bj = typename TestFixture::Bj; + using T = typename TestFixture::value_type; auto bj = Bj::build() .with_max_block_size(17u) - .with_block_pointers(block_pointers) + .with_block_pointers(this->block_pointers) .with_storage_optimization(gko::precision_reduction::autodetect()) - .with_accuracy(1.5e-3) - .on(exec) - ->generate(give(mtx)); + .with_accuracy(gko::remove_complex{1.5e-3}) + .on(this->exec) + ->generate(give(this->mtx)); auto prec = bj->get_parameters().storage_optimization.block_wise.get_const_data(); + auto precision2 = std::is_same, float>::value + ? 
gko::precision_reduction(0, 0) // float + : gko::precision_reduction(0, 1); // double EXPECT_EQ(prec[0], gko::precision_reduction(0, 2)); // u * cond = ~1.2e-3 - ASSERT_EQ(prec[1], gko::precision_reduction(0, 1)); // u * cond = ~2.0e-3 + ASSERT_EQ(prec[1], precision2); // u * cond = ~2.0e-3 } -TEST_F(Jacobi, AvoidsPrecisionsThatOverflow) +TYPED_TEST(Jacobi, AvoidsPrecisionsThatOverflow) { - auto mtx = gko::matrix::Csr<>::create(exec); + using Bj = typename TestFixture::Bj; + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + using mdata = typename TestFixture::mdata; + using T = typename TestFixture::value_type; + auto half_tol = std::sqrt(r::value); + auto mtx = Mtx::create(this->exec); // clang-format off mtx->read(mdata::diag({ - // perfectly conditioned block, small value difference, - // can use fp16 (5, 10) - {{2.0, 1.0}, - {1.0, 2.0}}, - // perfectly conditioned block (scaled orthogonal), - // with large value difference, need fp16 (7, 8) - {{1e-7, -1e-14}, - {1e-14, 1e-7}} + // perfectly conditioned block, small value difference, + // can use fp16 (5, 10) + {{2.0, 1.0}, + {1.0, 2.0}}, + // perfectly conditioned block (scaled orthogonal), + // with large value difference, need fp16 (7, 8) + {{half_tol, -r::value}, + {r::value, half_tol}} })); // clang-format on auto bj = Bj::build() .with_max_block_size(13u) - .with_block_pointers(gko::Array(exec, {0, 2, 4})) + .with_block_pointers(gko::Array(this->exec, {0, 2, 4})) .with_storage_optimization(gko::precision_reduction::autodetect()) - .with_accuracy(0.1) - .on(exec) + .with_accuracy(gko::remove_complex{1e-1}) + .on(this->exec) ->generate(give(mtx)); // both blocks are in the same group, both need (7, 8) auto prec = bj->get_parameters().storage_optimization.block_wise.get_const_data(); - EXPECT_EQ(prec[0], gko::precision_reduction(1, 1)); - ASSERT_EQ(prec[1], gko::precision_reduction(1, 1)); + auto precision = std::is_same, float>::value + ? 
gko::precision_reduction(0, 2) // float + : gko::precision_reduction(1, 1); // double + EXPECT_EQ(prec[0], precision); + ASSERT_EQ(prec[1], precision); } -TEST_F(Jacobi, AppliesToVector) +TYPED_TEST(Jacobi, AppliesToVector) { - auto x = gko::initialize({1.0, -1.0, 2.0, -2.0, 3.0}, exec); - auto b = gko::initialize({4.0, -1.0, -2.0, 4.0, -1.0}, exec); - auto bj = bj_factory->generate(mtx); + using Vec = typename TestFixture::Vec; + using value_type = typename TestFixture::value_type; + auto x = gko::initialize({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec); + auto b = gko::initialize({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec); + auto bj = this->bj_factory->generate(this->mtx); bj->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({1.0, 0.0, 0.0, 1.0, 0.0}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({1.0, 0.0, 0.0, 1.0, 0.0}), r::value); } -TEST_F(Jacobi, AppliesToVectorWithAdaptivePrecision) +TYPED_TEST(Jacobi, AppliesToVectorWithAdaptivePrecision) { - auto x = gko::initialize({1.0, -1.0, 2.0, -2.0, 3.0}, exec); - auto b = gko::initialize({4.0, -1.0, -2.0, 4.0, -1.0}, exec); - auto bj = adaptive_bj_factory->generate(mtx); + using Vec = typename TestFixture::Vec; + using value_type = typename TestFixture::value_type; + auto half_tol = std::sqrt(r::value); + auto x = gko::initialize({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec); + auto b = gko::initialize({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec); + auto bj = this->adaptive_bj_factory->generate(this->mtx); bj->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({1.0, 0.0, 0.0, 1.0, 0.0}), 1e-7); + GKO_ASSERT_MTX_NEAR(x, l({1.0, 0.0, 0.0, 1.0, 0.0}), half_tol); } -TEST_F(Jacobi, AppliesToVectorWithAdaptivePrecisionAndSmallBlocks) +TYPED_TEST(Jacobi, AppliesToVectorWithAdaptivePrecisionAndSmallBlocks) { - auto x = gko::initialize({1.0, -1.0, 2.0, -2.0, 3.0}, exec); - auto b = gko::initialize({4.0, -1.0, -2.0, 4.0, -1.0}, exec); + using Bj = typename TestFixture::Bj; + using Vec = typename TestFixture::Vec; + using value_type = typename 
TestFixture::value_type; + auto half_tol = std::sqrt(r::value); + auto x = gko::initialize({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec); + auto b = gko::initialize({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec); auto bj = Bj::build() .with_max_block_size(3u) // group size will be > 1 - .with_block_pointers(block_pointers) - .with_storage_optimization(block_precisions) - .on(exec) - ->generate(mtx); + .with_block_pointers(this->block_pointers) + .with_storage_optimization(this->block_precisions) + .on(this->exec) + ->generate(this->mtx); bj->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({1.0, 0.0, 0.0, 1.0, 0.0}), 1e-7); + GKO_ASSERT_MTX_NEAR(x, l({1.0, 0.0, 0.0, 1.0, 0.0}), half_tol); } -TEST_F(Jacobi, AppliesToMultipleVectors) +TYPED_TEST(Jacobi, AppliesToMultipleVectors) { - auto x = gko::initialize( - 3, {{1.0, 0.5}, {-1.0, -0.5}, {2.0, 1.0}, {-2.0, -1.0}, {3.0, 1.5}}, - exec); - auto b = gko::initialize( - 3, {{4.0, -2.0}, {-1.0, 4.0}, {-2.0, 0.0}, {4.0, -2.0}, {-1.0, 4.0}}, - exec); - auto bj = bj_factory->generate(mtx); + using Vec = typename TestFixture::Vec; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto x = + gko::initialize(3, + {I{1.0, 0.5}, I{-1.0, -0.5}, I{2.0, 1.0}, + I{-2.0, -1.0}, I{3.0, 1.5}}, + this->exec); + auto b = + gko::initialize(3, + {I{4.0, -2.0}, I{-1.0, 4.0}, I{-2.0, 0.0}, + I{4.0, -2.0}, I{-1.0, 4.0}}, + this->exec); + auto bj = this->bj_factory->generate(this->mtx); bj->apply(b.get(), x.get()); GKO_ASSERT_MTX_NEAR( x, l({{1.0, 0.0}, {0.0, 1.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 1.0}}), - 1e-14); + r::value); } -TEST_F(Jacobi, AppliesToMultipleVectorsWithAdaptivePrecision) +TYPED_TEST(Jacobi, AppliesToMultipleVectorsWithAdaptivePrecision) { - auto x = gko::initialize( - 3, {{1.0, 0.5}, {-1.0, -0.5}, {2.0, 1.0}, {-2.0, -1.0}, {3.0, 1.5}}, - exec); - auto b = gko::initialize( - 3, {{4.0, -2.0}, {-1.0, 4.0}, {-2.0, 0.0}, {4.0, -2.0}, {-1.0, 4.0}}, - exec); - auto bj = adaptive_bj_factory->generate(mtx); + 
using Vec = typename TestFixture::Vec; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto half_tol = std::sqrt(r::value); + auto x = + gko::initialize(3, + {I{1.0, 0.5}, I{-1.0, -0.5}, I{2.0, 1.0}, + I{-2.0, -1.0}, I{3.0, 1.5}}, + this->exec); + auto b = + gko::initialize(3, + {I{4.0, -2.0}, I{-1.0, 4.0}, I{-2.0, 0.0}, + I{4.0, -2.0}, I{-1.0, 4.0}}, + this->exec); + auto bj = this->adaptive_bj_factory->generate(this->mtx); bj->apply(b.get(), x.get()); GKO_ASSERT_MTX_NEAR( x, l({{1.0, 0.0}, {0.0, 1.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 1.0}}), - 1e-7); + half_tol); } -TEST_F(Jacobi, AppliesToMultipleVectorsWithAdaptivePrecisionAndSmallBlocks) +TYPED_TEST(Jacobi, AppliesToMultipleVectorsWithAdaptivePrecisionAndSmallBlocks) { - auto x = gko::initialize( - 3, {{1.0, 0.5}, {-1.0, -0.5}, {2.0, 1.0}, {-2.0, -1.0}, {3.0, 1.5}}, - exec); - auto b = gko::initialize( - 3, {{4.0, -2.0}, {-1.0, 4.0}, {-2.0, 0.0}, {4.0, -2.0}, {-1.0, 4.0}}, - exec); + using Vec = typename TestFixture::Vec; + using Bj = typename TestFixture::Bj; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto half_tol = std::sqrt(r::value); + auto x = + gko::initialize(3, + {I{1.0, 0.5}, I{-1.0, -0.5}, I{2.0, 1.0}, + I{-2.0, -1.0}, I{3.0, 1.5}}, + this->exec); + auto b = + gko::initialize(3, + {I{4.0, -2.0}, I{-1.0, 4.0}, I{-2.0, 0.0}, + I{4.0, -2.0}, I{-1.0, 4.0}}, + this->exec); auto bj = Bj::build() .with_max_block_size(3u) // group size will be > 1 - .with_block_pointers(block_pointers) - .with_storage_optimization(block_precisions) - .on(exec) - ->generate(mtx); + .with_block_pointers(this->block_pointers) + .with_storage_optimization(this->block_precisions) + .on(this->exec) + ->generate(this->mtx); bj->apply(b.get(), x.get()); GKO_ASSERT_MTX_NEAR( x, l({{1.0, 0.0}, {0.0, 1.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 1.0}}), - 1e-7); + half_tol); } -TEST_F(Jacobi, AppliesLinearCombinationToVector) +TYPED_TEST(Jacobi, 
AppliesLinearCombinationToVector) { - auto x = gko::initialize({1.0, -1.0, 2.0, -2.0, 3.0}, exec); - auto b = gko::initialize({4.0, -1.0, -2.0, 4.0, -1.0}, exec); - auto alpha = gko::initialize({2.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - auto bj = bj_factory->generate(mtx); + using Vec = typename TestFixture::Vec; + using value_type = typename TestFixture::value_type; + auto x = gko::initialize({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec); + auto b = gko::initialize({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec); + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto bj = this->bj_factory->generate(this->mtx); bj->apply(alpha.get(), b.get(), beta.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({1.0, 1.0, -2.0, 4.0, -3.0}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({1.0, 1.0, -2.0, 4.0, -3.0}), + r::value); } -TEST_F(Jacobi, AppliesLinearCombinationToVectorWithAdaptivePrecision) +TYPED_TEST(Jacobi, AppliesLinearCombinationToVectorWithAdaptivePrecision) { - auto x = gko::initialize({1.0, -1.0, 2.0, -2.0, 3.0}, exec); - auto b = gko::initialize({4.0, -1.0, -2.0, 4.0, -1.0}, exec); - auto alpha = gko::initialize({2.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - auto bj = adaptive_bj_factory->generate(mtx); + using Vec = typename TestFixture::Vec; + using value_type = typename TestFixture::value_type; + auto half_tol = std::sqrt(r::value); + auto x = gko::initialize({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec); + auto b = gko::initialize({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec); + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto bj = this->adaptive_bj_factory->generate(this->mtx); bj->apply(alpha.get(), b.get(), beta.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({1.0, 1.0, -2.0, 4.0, -3.0}), 1e-7); + GKO_ASSERT_MTX_NEAR(x, l({1.0, 1.0, -2.0, 4.0, -3.0}), half_tol); } -TEST_F(Jacobi, AppliesLinearCombinationToMultipleVectors) +TYPED_TEST(Jacobi, 
AppliesLinearCombinationToMultipleVectors) { - auto x = gko::initialize( - 3, {{1.0, 0.5}, {-1.0, -0.5}, {2.0, 1.0}, {-2.0, -1.0}, {3.0, 1.5}}, - exec); - auto b = gko::initialize( - 3, {{4.0, -2.0}, {-1.0, 4.0}, {-2.0, 0.0}, {4.0, -2.0}, {-1.0, 4.0}}, - exec); - auto alpha = gko::initialize({2.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - auto bj = bj_factory->generate(mtx); + using Vec = typename TestFixture::Vec; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto half_tol = std::sqrt(r::value); + auto x = + gko::initialize(3, + {I{1.0, 0.5}, I{-1.0, -0.5}, I{2.0, 1.0}, + I{-2.0, -1.0}, I{3.0, 1.5}}, + this->exec); + auto b = + gko::initialize(3, + {I{4.0, -2.0}, I{-1.0, 4.0}, I{-2.0, 0.0}, + I{4.0, -2.0}, I{-1.0, 4.0}}, + this->exec); + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto bj = this->bj_factory->generate(this->mtx); bj->apply(alpha.get(), b.get(), beta.get(), x.get()); GKO_ASSERT_MTX_NEAR( x, l({{1.0, -0.5}, {1.0, 2.5}, {-2.0, -1.0}, {4.0, 1.0}, {-3.0, 0.5}}), - 1e-14); + r::value); } -TEST_F(Jacobi, AppliesLinearCombinationToMultipleVectorsWithAdaptivePrecision) +TYPED_TEST(Jacobi, + AppliesLinearCombinationToMultipleVectorsWithAdaptivePrecision) { - auto x = gko::initialize( - 3, {{1.0, 0.5}, {-1.0, -0.5}, {2.0, 1.0}, {-2.0, -1.0}, {3.0, 1.5}}, - exec); - auto b = gko::initialize( - 3, {{4.0, -2.0}, {-1.0, 4.0}, {-2.0, 0.0}, {4.0, -2.0}, {-1.0, 4.0}}, - exec); - auto alpha = gko::initialize({2.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - auto bj = adaptive_bj_factory->generate(mtx); + using Vec = typename TestFixture::Vec; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto half_tol = std::sqrt(r::value); + auto x = + gko::initialize(3, + {I{1.0, 0.5}, I{-1.0, -0.5}, I{2.0, 1.0}, + I{-2.0, -1.0}, I{3.0, 1.5}}, + this->exec); + auto b = + gko::initialize(3, + {I{4.0, -2.0}, I{-1.0, 4.0}, I{-2.0, 
0.0}, + I{4.0, -2.0}, I{-1.0, 4.0}}, + this->exec); + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto bj = this->adaptive_bj_factory->generate(this->mtx); bj->apply(alpha.get(), b.get(), beta.get(), x.get()); GKO_ASSERT_MTX_NEAR( x, l({{1.0, -0.5}, {1.0, 2.5}, {-2.0, -1.0}, {4.0, 1.0}, {-3.0, 0.5}}), - 1e-7); + half_tol); } -TEST_F(Jacobi, ConvertsToDense) +TYPED_TEST(Jacobi, ConvertsToDense) { - auto dense = gko::matrix::Dense<>::create(exec); + using Vec = typename TestFixture::Vec; + using value_type = typename TestFixture::value_type; + auto dense = Vec::create(this->exec); - dense->copy_from(bj_factory->generate(mtx)); + dense->copy_from(this->bj_factory->generate(this->mtx)); // clang-format off GKO_ASSERT_MTX_NEAR(dense, @@ -631,16 +882,19 @@ TEST_F(Jacobi, ConvertsToDense) {1.0 / 14, 4.0 / 14, 0.0, 0.0, 0.0}, { 0.0, 0.0, 14.0 / 48, 8.0 / 48, 4.0 / 48}, { 0.0, 0.0, 4.0 / 48, 16.0 / 48, 8.0 / 48}, - { 0.0, 0.0, 1.0 / 48, 4.0 / 48, 14.0 / 48}}), 1e-14); + { 0.0, 0.0, 1.0 / 48, 4.0 / 48, 14.0 / 48}}), r::value); // clang-format on } -TEST_F(Jacobi, ConvertsToDenseWithAdaptivePrecision) +TYPED_TEST(Jacobi, ConvertsToDenseWithAdaptivePrecision) { - auto dense = gko::matrix::Dense<>::create(exec); + using Vec = typename TestFixture::Vec; + using value_type = typename TestFixture::value_type; + auto half_tol = std::sqrt(r::value); + auto dense = Vec::create(this->exec); - dense->copy_from(adaptive_bj_factory->generate(mtx)); + dense->copy_from(this->adaptive_bj_factory->generate(this->mtx)); // clang-format off GKO_ASSERT_MTX_NEAR(dense, @@ -648,9 +902,22 @@ TEST_F(Jacobi, ConvertsToDenseWithAdaptivePrecision) {1.0 / 14, 4.0 / 14, 0.0, 0.0, 0.0}, { 0.0, 0.0, 14.0 / 48, 8.0 / 48, 4.0 / 48}, { 0.0, 0.0, 4.0 / 48, 16.0 / 48, 8.0 / 48}, - { 0.0, 0.0, 1.0 / 48, 4.0 / 48, 14.0 / 48}}), 1e-7); + { 0.0, 0.0, 1.0 / 48, 4.0 / 48, 14.0 / 48}}), half_tol); // clang-format on } +TYPED_TEST(Jacobi, ConvertsEmptyToDense) 
+{ + using Vec = typename TestFixture::Vec; + auto empty = Vec::create(this->exec); + auto res = Vec::create(this->exec); + + res->copy_from( + TestFixture::Bj::build().on(this->exec)->generate(gko::share(empty))); + + ASSERT_FALSE(res->get_size()); +} + + } // namespace diff --git a/reference/test/solver/CMakeLists.txt b/reference/test/solver/CMakeLists.txt index 86a6bbf576b..c86beb38b5c 100644 --- a/reference/test/solver/CMakeLists.txt +++ b/reference/test/solver/CMakeLists.txt @@ -1,3 +1,4 @@ +ginkgo_create_test(bicg_kernels) ginkgo_create_test(bicgstab_kernels) ginkgo_create_test(cg_kernels) ginkgo_create_test(cgs_kernels) diff --git a/reference/test/solver/bicg_kernels.cpp b/reference/test/solver/bicg_kernels.cpp new file mode 100644 index 00000000000..e1d5b35da65 --- /dev/null +++ b/reference/test/solver/bicg_kernels.cpp @@ -0,0 +1,338 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include +#include +#include +#include +#include +#include +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +template +class Bicg : public ::testing::Test { +protected: + using value_type = T; + using Mtx = gko::matrix::Dense; + using Solver = gko::solver::Bicg; + Bicg() + : exec(gko::ReferenceExecutor::create()), + mtx(gko::initialize( + {{2, -1.0, 0.0}, {-1.0, 2, -1.0}, {0.0, -1.0, 2}}, exec)), + bicg_factory( + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(4u).on(exec), + gko::stop::Time::build() + .with_time_limit(std::chrono::seconds(6)) + .on(exec), + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) + .on(exec)) + .on(exec)), + mtx_big(gko::initialize( + {{8828.0, 2673.0, 4150.0, -3139.5, 3829.5, 5856.0}, + {2673.0, 10765.5, 1805.0, 73.0, 1966.0, 3919.5}, + {4150.0, 1805.0, 6472.5, 2656.0, 2409.5, 3836.5}, + {-3139.5, 73.0, 2656.0, 6048.0, 665.0, -132.0}, + {3829.5, 1966.0, 2409.5, 665.0, 4240.5, 4373.5}, + {5856.0, 3919.5, 3836.5, -132.0, 4373.5, 5678.0}}, + exec)), + bicg_factory_big( + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(100u).on( + exec), + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) + .on(exec)) + .on(exec)), + mtx_non_symmetric(gko::initialize( + {{1.0, 2.0, 3.0}, {3.0, 2.0, -1.0}, {0.0, 
-1.0, 2}}, exec)) + + + {} + + std::shared_ptr exec; + std::shared_ptr mtx; + std::shared_ptr mtx_big; + std::shared_ptr mtx_non_symmetric; + std::unique_ptr bicg_factory; + std::unique_ptr bicg_factory_big; + std::unique_ptr bicg_factory_non_symmetric; +}; + +TYPED_TEST_CASE(Bicg, gko::test::ValueTypes); + + +TYPED_TEST(Bicg, SolvesStencilSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->bicg_factory->generate(this->mtx); + auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + + solver->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r::value); +} + + +TYPED_TEST(Bicg, SolvesMultipleStencilSystems) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto solver = this->bicg_factory->generate(this->mtx); + auto b = gko::initialize( + {I{-1.0, 1.0}, I{3.0, 0.0}, I{1.0, 1.0}}, this->exec); + auto x = gko::initialize( + {I{0.0, 0.0}, I{0.0, 0.0}, I{0.0, 0.0}}, this->exec); + + solver->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({{1.0, 1.0}, {3.0, 1.0}, {2.0, 1.0}}), + r::value); +} + + +TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApply) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->bicg_factory->generate(this->mtx); + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); + auto x = gko::initialize({0.5, 1.0, 2.0}, this->exec); + + solver->apply(alpha.get(), b.get(), beta.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({1.5, 5.0, 2.0}), r::value); +} + + +TYPED_TEST(Bicg, SolvesMultipleStencilSystemsUsingAdvancedApply) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto 
solver = this->bicg_factory->generate(this->mtx); + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto b = gko::initialize( + {I{-1.0, 1.0}, I{3.0, 0.0}, I{1.0, 1.0}}, this->exec); + auto x = gko::initialize( + {I{0.5, 1.0}, I{1.0, 2.0}, I{2.0, 3.0}}, this->exec); + + solver->apply(alpha.get(), b.get(), beta.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({{1.5, 1.0}, {5.0, 0.0}, {2.0, -1.0}}), + r::value * 1e1); +} + + +TYPED_TEST(Bicg, SolvesBigDenseSystem1) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->bicg_factory_big->generate(this->mtx_big); + auto b = gko::initialize( + {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, + this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); + + solver->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({81.0, 55.0, 45.0, 5.0, 85.0, -10.0}), + r::value * 1e2); +} + + +TYPED_TEST(Bicg, SolvesBigDenseSystem2) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->bicg_factory_big->generate(this->mtx_big); + auto b = gko::initialize( + {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, + this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); + + solver->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({33.0, -56.0, 81.0, -30.0, 21.0, 40.0}), + r::value * 1e2); +} + + +TYPED_TEST(Bicg, SolvesNonSymmetricStencilSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->bicg_factory->generate(this->mtx_non_symmetric); + auto b = gko::initialize({13.0, 7.0, 1.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + + solver->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r::value * 1e2); +} + + +template +gko::remove_complex 
infNorm(gko::matrix::Dense *mat, size_t col = 0) +{ + using std::abs; + using no_cpx_t = gko::remove_complex; + no_cpx_t norm = 0.0; + for (size_t i = 0; i < mat->get_size()[0]; ++i) { + no_cpx_t absEntry = abs(mat->at(i, col)); + if (norm < absEntry) norm = absEntry; + } + return norm; +} + + +TYPED_TEST(Bicg, SolvesMultipleDenseSystemForDivergenceCheck) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->bicg_factory_big->generate(this->mtx_big); + auto b1 = gko::initialize( + {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, + this->exec); + auto b2 = gko::initialize( + {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, + this->exec); + + auto x1 = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); + auto x2 = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); + + auto bc = + Mtx::create(this->exec, gko::dim<2>{this->mtx_big->get_size()[0], 2}); + auto xc = + Mtx::create(this->exec, gko::dim<2>{this->mtx_big->get_size()[1], 2}); + for (size_t i = 0; i < bc->get_size()[0]; ++i) { + bc->at(i, 0) = b1->at(i); + bc->at(i, 1) = b2->at(i); + + xc->at(i, 0) = x1->at(i); + xc->at(i, 1) = x2->at(i); + } + + solver->apply(b1.get(), x1.get()); + solver->apply(b2.get(), x2.get()); + solver->apply(bc.get(), xc.get()); + auto mergedRes = Mtx::create(this->exec, gko::dim<2>{b1->get_size()[0], 2}); + for (size_t i = 0; i < mergedRes->get_size()[0]; ++i) { + mergedRes->at(i, 0) = x1->at(i); + mergedRes->at(i, 1) = x2->at(i); + } + + auto alpha = gko::initialize({1.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + + auto residual1 = Mtx::create(this->exec, b1->get_size()); + residual1->copy_from(b1.get()); + auto residual2 = Mtx::create(this->exec, b2->get_size()); + residual2->copy_from(b2.get()); + auto residualC = Mtx::create(this->exec, bc->get_size()); + residualC->copy_from(bc.get()); + + this->mtx_big->apply(alpha.get(), x1.get(), 
beta.get(), residual1.get()); + this->mtx_big->apply(alpha.get(), x2.get(), beta.get(), residual2.get()); + this->mtx_big->apply(alpha.get(), xc.get(), beta.get(), residualC.get()); + + auto normS1 = infNorm(residual1.get()); + auto normS2 = infNorm(residual2.get()); + auto normC1 = infNorm(residualC.get(), 0); + auto normC2 = infNorm(residualC.get(), 1); + auto normB1 = infNorm(b1.get()); + auto normB2 = infNorm(b2.get()); + + // make sure that all combined solutions are as good or better than the + // single solutions + ASSERT_LE(normC1 / normB1, normS1 / normB1 + r::value); + ASSERT_LE(normC2 / normB2, normS2 / normB2 + r::value); + + // Not sure if this is necessary, the assertions above should cover what is + // needed. + GKO_ASSERT_MTX_NEAR(xc, mergedRes, r::value); +} + + +TYPED_TEST(Bicg, SolvesTransposedNonSymmetricStencilSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = + this->bicg_factory->generate(this->mtx_non_symmetric->transpose()); + auto b = gko::initialize({13.0, 7.0, 1.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + + solver->transpose()->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r::value * 1e2); +} + + +TYPED_TEST(Bicg, SolvesConjTransposedNonSymmetricStencilSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = + this->bicg_factory->generate(this->mtx_non_symmetric->conj_transpose()); + auto b = gko::initialize({13.0, 7.0, 1.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + + solver->conj_transpose()->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r::value * 1e2); +} + + +} // namespace diff --git a/reference/test/solver/bicgstab_kernels.cpp b/reference/test/solver/bicgstab_kernels.cpp index c6ceb0b88ba..45836e2ac01 100644 --- a/reference/test/solver/bicgstab_kernels.cpp +++ 
b/reference/test/solver/bicgstab_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -36,23 +36,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include #include #include #include #include -#include +#include #include +#include "core/test/utils.hpp" + + namespace { +template class Bicgstab : public ::testing::Test { protected: - using Mtx = gko::matrix::Dense<>; - using Solver = gko::solver::Bicgstab<>; + using value_type = T; + using Mtx = gko::matrix::Dense; + using Solver = gko::solver::Bicgstab; Bicgstab() : exec(gko::ReferenceExecutor::create()), @@ -65,91 +69,113 @@ class Bicgstab : public ::testing::Test { gko::stop::Time::build() .with_time_limit(std::chrono::seconds(6)) .on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-15) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) .on(exec)) .on(exec)), bicgstab_factory_precision( - gko::solver::Bicgstab<>::build() + Solver::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(50u).on( exec), gko::stop::Time::build() .with_time_limit(std::chrono::seconds(6)) .on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-15) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) .on(exec)) .on(exec)) {} std::shared_ptr exec; std::shared_ptr mtx; - std::unique_ptr::Factory> bicgstab_factory; - std::unique_ptr::Factory> - bicgstab_factory_precision; + std::unique_ptr bicgstab_factory; + std::unique_ptr bicgstab_factory_precision; }; +TYPED_TEST_CASE(Bicgstab, gko::test::ValueTypes); -TEST_F(Bicgstab, SolvesDenseSystem) + +TYPED_TEST(Bicgstab, SolvesDenseSystem) { - auto solver = bicgstab_factory->generate(mtx); - auto b = 
gko::initialize({-1.0, 3.0, 1.0}, exec); - auto x = gko::initialize({0.0, 0.0, 0.0}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto half_tol = std::sqrt(r::value); + auto solver = this->bicgstab_factory->generate(this->mtx); + auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({-4.0, -1.0, 4.0}), 1e-8); + GKO_ASSERT_MTX_NEAR(x, l({-4.0, -1.0, 4.0}), half_tol); } -TEST_F(Bicgstab, SolvesMultipleDenseSystems) +TYPED_TEST(Bicgstab, SolvesMultipleDenseSystems) { - auto solver = bicgstab_factory->generate(mtx); - auto b = - gko::initialize({{-1.0, -5.0}, {3.0, 1.0}, {1.0, -2.0}}, exec); - auto x = gko::initialize({{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto half_tol = std::sqrt(r::value); + auto solver = this->bicgstab_factory->generate(this->mtx); + auto b = gko::initialize( + {I{-1.0, -5.0}, I{3.0, 1.0}, I{1.0, -2.0}}, this->exec); + auto x = gko::initialize( + {I{0.0, 0.0}, I{0.0, 0.0}, I{0.0, 0.0}}, this->exec); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({{-4.0, 1.0}, {-1.0, 2.0}, {4.0, -1.0}}), 1e-8); + GKO_ASSERT_MTX_NEAR(x, l({{-4.0, 1.0}, {-1.0, 2.0}, {4.0, -1.0}}), + half_tol); } -TEST_F(Bicgstab, SolvesDenseSystemUsingAdvancedApply) +TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApply) { - auto solver = bicgstab_factory->generate(mtx); - auto alpha = gko::initialize({2.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - auto b = gko::initialize({-1.0, 3.0, 1.0}, exec); - auto x = gko::initialize({0.5, 1.0, 2.0}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto half_tol = std::sqrt(r::value); + auto solver = this->bicgstab_factory->generate(this->mtx); + auto alpha = 
gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); + auto x = gko::initialize({0.5, 1.0, 2.0}, this->exec); solver->apply(alpha.get(), b.get(), beta.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), 1e-8); + GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), half_tol); } -TEST_F(Bicgstab, SolvesMultipleDenseSystemsUsingAdvancedApply) +TYPED_TEST(Bicgstab, SolvesMultipleDenseSystemsUsingAdvancedApply) { - auto solver = bicgstab_factory->generate(mtx); - auto alpha = gko::initialize({2.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - auto b = - gko::initialize({{-1.0, -5.0}, {3.0, 1.0}, {1.0, -2.0}}, exec); - auto x = gko::initialize({{0.5, 1.0}, {1.0, 2.0}, {2.0, 3.0}}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto half_tol = std::sqrt(r::value); + auto solver = this->bicgstab_factory->generate(this->mtx); + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto b = gko::initialize( + {I{-1.0, -5.0}, I{3.0, 1.0}, I{1.0, -2.0}}, this->exec); + auto x = gko::initialize( + {I{0.5, 1.0}, I{1.0, 2.0}, I{2.0, 3.0}}, this->exec); solver->apply(alpha.get(), b.get(), beta.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({{-8.5, 1.0}, {-3.0, 2.0}, {6.0, -5.0}}), 1e-8); + GKO_ASSERT_MTX_NEAR(x, l({{-8.5, 1.0}, {-3.0, 2.0}, {6.0, -5.0}}), + half_tol); } // The following test-data was generated and validated with MATLAB -TEST_F(Bicgstab, SolvesBigDenseSystemForDivergenceCheck1) +TYPED_TEST(Bicgstab, SolvesBigDenseSystemForDivergenceCheck1) { + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto half_tol = std::sqrt(r::value); std::shared_ptr locmtx = gko::initialize({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0}, {-8.0, -66.0, 29.0, -96.0, -95.0, -14.0}, @@ -157,10 +183,11 @@ TEST_F(Bicgstab, 
SolvesBigDenseSystemForDivergenceCheck1) {60.0, -86.0, 54.0, -40.0, -93.0, 56.0}, {53.0, 94.0, -54.0, 86.0, -61.0, 4.0}, {-42.0, 57.0, 32.0, 89.0, 89.0, -39.0}}, - exec); - auto solver = bicgstab_factory_precision->generate(locmtx); - auto b = gko::initialize({0.0, -9.0, -2.0, 8.0, -5.0, -6.0}, exec); - auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec); + this->exec); + auto solver = this->bicgstab_factory_precision->generate(locmtx); + auto b = + gko::initialize({0.0, -9.0, -2.0, 8.0, -5.0, -6.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); solver->apply(b.get(), x.get()); @@ -168,12 +195,15 @@ TEST_F(Bicgstab, SolvesBigDenseSystemForDivergenceCheck1) x, l({0.13853406350816114, -0.08147485210505287, -0.0450299311807042, -0.0051264177562865719, 0.11609654300797841, 0.1018688746740561}), - 1e-9); + half_tol * 5e-1); } -TEST_F(Bicgstab, SolvesBigDenseSystemForDivergenceCheck2) +TYPED_TEST(Bicgstab, SolvesBigDenseSystemForDivergenceCheck2) { + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto half_tol = std::sqrt(r::value); std::shared_ptr locmtx = gko::initialize({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0}, {-8.0, -66.0, 29.0, -96.0, -95.0, -14.0}, @@ -181,10 +211,11 @@ TEST_F(Bicgstab, SolvesBigDenseSystemForDivergenceCheck2) {60.0, -86.0, 54.0, -40.0, -93.0, 56.0}, {53.0, 94.0, -54.0, 86.0, -61.0, 4.0}, {-42.0, 57.0, 32.0, 89.0, 89.0, -39.0}}, - exec); - auto solver = bicgstab_factory_precision->generate(locmtx); - auto b = gko::initialize({9.0, -4.0, -6.0, -10.0, 1.0, 10.0}, exec); - auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec); + this->exec); + auto solver = this->bicgstab_factory_precision->generate(locmtx); + auto b = + gko::initialize({9.0, -4.0, -6.0, -10.0, 1.0, 10.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); solver->apply(b.get(), x.get()); @@ -192,24 +223,29 @@ TEST_F(Bicgstab, 
SolvesBigDenseSystemForDivergenceCheck2) x, l({0.13517641417299162, 0.75117689075221139, 0.47572853185155239, -0.50927993095367852, 0.13463333820848167, 0.23126768306576015}), - 1e-9); + half_tol * 1e-1); } -double infNorm(gko::matrix::Dense<> *mat, size_t col = 0) +template +gko::remove_complex infNorm(gko::matrix::Dense *mat, size_t col = 0) { using std::abs; - double norm = 0.0; + using no_cpx_t = gko::remove_complex; + no_cpx_t norm = 0.0; for (size_t i = 0; i < mat->get_size()[0]; ++i) { - double absEntry = abs(mat->at(i, col)); + no_cpx_t absEntry = abs(mat->at(i, col)); if (norm < absEntry) norm = absEntry; } return norm; } -TEST_F(Bicgstab, SolvesMultipleDenseSystemsDivergenceCheck) +TYPED_TEST(Bicgstab, SolvesMultipleDenseSystemsDivergenceCheck) { + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using T = value_type; std::shared_ptr locmtx = gko::initialize({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0}, {-8.0, -66.0, 29.0, -96.0, -95.0, -14.0}, @@ -217,16 +253,20 @@ TEST_F(Bicgstab, SolvesMultipleDenseSystemsDivergenceCheck) {60.0, -86.0, 54.0, -40.0, -93.0, 56.0}, {53.0, 94.0, -54.0, 86.0, -61.0, 4.0}, {-42.0, 57.0, 32.0, 89.0, 89.0, -39.0}}, - exec); - auto solver = bicgstab_factory_precision->generate(locmtx); - auto b1 = gko::initialize({0.0, -9.0, -2.0, 8.0, -5.0, -6.0}, exec); - auto x1 = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec); - auto b2 = gko::initialize({9.0, -4.0, -6.0, -10.0, 1.0, 10.0}, exec); - auto x2 = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec); - auto bc = gko::initialize( - {{0., 0.}, {0., 0.}, {0., 0.}, {0., 0.}, {0., 0.}, {0., 0.}}, exec); - auto xc = gko::initialize( - {{0., 0.}, {0., 0.}, {0., 0.}, {0., 0.}, {0., 0.}, {0., 0.}}, exec); + this->exec); + auto solver = this->bicgstab_factory_precision->generate(locmtx); + auto b1 = + gko::initialize({0.0, -9.0, -2.0, 8.0, -5.0, -6.0}, this->exec); + auto x1 = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); + 
auto b2 = + gko::initialize({9.0, -4.0, -6.0, -10.0, 1.0, 10.0}, this->exec); + auto x2 = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); + auto bc = gko::initialize({I{0., 0.}, I{0., 0.}, I{0., 0.}, + I{0., 0.}, I{0., 0.}, I{0., 0.}}, + this->exec); + auto xc = gko::initialize({I{0., 0.}, I{0., 0.}, I{0., 0.}, + I{0., 0.}, I{0., 0.}, I{0., 0.}}, + this->exec); for (size_t i = 0; i < xc->get_size()[0]; ++i) { bc->at(i, 0) = b1->at(i); bc->at(i, 1) = b2->at(i); @@ -237,42 +277,74 @@ TEST_F(Bicgstab, SolvesMultipleDenseSystemsDivergenceCheck) solver->apply(b1.get(), x1.get()); solver->apply(b2.get(), x2.get()); solver->apply(bc.get(), xc.get()); - auto testMtx = gko::initialize( - {{0., 0.}, {0., 0.}, {0., 0.}, {0., 0.}, {0., 0.}, {0., 0.}}, exec); + auto testMtx = + gko::initialize({I{0., 0.}, I{0., 0.}, I{0., 0.}, + I{0., 0.}, I{0., 0.}, I{0., 0.}}, + this->exec); for (size_t i = 0; i < testMtx->get_size()[0]; ++i) { testMtx->at(i, 0) = x1->at(i); testMtx->at(i, 1) = x2->at(i); } - auto alpha = gko::initialize({1.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - auto residual1 = gko::initialize({0.}, exec); + auto alpha = gko::initialize({1.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto residual1 = gko::initialize({0.}, this->exec); residual1->copy_from(b1->clone()); - auto residual2 = gko::initialize({0.}, exec); + auto residual2 = gko::initialize({0.}, this->exec); residual2->copy_from(b2->clone()); - auto residualC = gko::initialize({0.}, exec); + auto residualC = gko::initialize({0.}, this->exec); residualC->copy_from(bc->clone()); locmtx->apply(alpha.get(), x1.get(), beta.get(), residual1.get()); locmtx->apply(alpha.get(), x2.get(), beta.get(), residual2.get()); locmtx->apply(alpha.get(), xc.get(), beta.get(), residualC.get()); - double normS1 = infNorm(residual1.get()); - double normS2 = infNorm(residual2.get()); - double normC1 = infNorm(residualC.get(), 0); - double normC2 = infNorm(residualC.get(), 1); - 
double normB1 = infNorm(bc.get(), 0); - double normB2 = infNorm(bc.get(), 1); + auto normS1 = infNorm(residual1.get()); + auto normS2 = infNorm(residual2.get()); + auto normC1 = infNorm(residualC.get(), 0); + auto normC2 = infNorm(residualC.get(), 1); + auto normB1 = infNorm(bc.get(), 0); + auto normB2 = infNorm(bc.get(), 1); // make sure that all combined solutions are as good or better than the // single solutions - ASSERT_LE(normC1 / normB1, normS1 / normB1 + 1e-12); - ASSERT_LE(normC2 / normB2, normS2 / normB2 + 1e-12); + ASSERT_LE(normC1 / normB1, normS1 / normB1 + r::value * 1e2); + ASSERT_LE(normC2 / normB2, normS2 / normB2 + r::value * 1e2); // Not sure if this is necessary, the assertions above should cover what is // needed. - GKO_ASSERT_MTX_NEAR(xc, testMtx, 1e-14); + GKO_ASSERT_MTX_NEAR(xc, testMtx, r::value); +} + + +TYPED_TEST(Bicgstab, SolvesTransposedDenseSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto half_tol = std::sqrt(r::value); + auto solver = this->bicgstab_factory->generate(this->mtx->transpose()); + auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + + solver->transpose()->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({-4.0, -1.0, 4.0}), half_tol); +} + + +TYPED_TEST(Bicgstab, SolvesConjTransposedDenseSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto half_tol = std::sqrt(r::value); + auto solver = this->bicgstab_factory->generate(this->mtx->conj_transpose()); + auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + + solver->conj_transpose()->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({-4.0, -1.0, 4.0}), half_tol); } diff --git a/reference/test/solver/cg_kernels.cpp b/reference/test/solver/cg_kernels.cpp index 349e83b93db..adeea72b0ae 100644 --- 
a/reference/test/solver/cg_kernels.cpp +++ b/reference/test/solver/cg_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -36,35 +36,40 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include #include #include #include #include -#include +#include #include +#include "core/test/utils.hpp" + + namespace { +template class Cg : public ::testing::Test { protected: - using Mtx = gko::matrix::Dense<>; + using value_type = T; + using Mtx = gko::matrix::Dense; + using Solver = gko::solver::Cg; Cg() : exec(gko::ReferenceExecutor::create()), mtx(gko::initialize( {{2, -1.0, 0.0}, {-1.0, 2, -1.0}, {0.0, -1.0, 2}}, exec)), cg_factory( - gko::solver::Cg<>::build() + Solver::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(4u).on(exec), gko::stop::Time::build() .with_time_limit(std::chrono::seconds(6)) .on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-15) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) .on(exec)) .on(exec)), mtx_big(gko::initialize( @@ -76,12 +81,12 @@ class Cg : public ::testing::Test { {5856.0, 3919.5, 3836.5, -132.0, 4373.5, 5678.0}}, exec)), cg_factory_big( - gko::solver::Cg<>::build() + Solver::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(100u).on( exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-15) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) .on(exec)) .on(exec)) {} @@ -89,114 +94,148 @@ class Cg : public ::testing::Test { std::shared_ptr exec; std::shared_ptr mtx; std::shared_ptr mtx_big; - std::unique_ptr::Factory> cg_factory; - std::unique_ptr::Factory> cg_factory_big; + std::unique_ptr cg_factory; + std::unique_ptr 
cg_factory_big; }; +TYPED_TEST_CASE(Cg, gko::test::ValueTypes); + -TEST_F(Cg, SolvesStencilSystem) +TYPED_TEST(Cg, SolvesStencilSystem) { - auto solver = cg_factory->generate(mtx); - auto b = gko::initialize({-1.0, 3.0, 1.0}, exec); - auto x = gko::initialize({0.0, 0.0, 0.0}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->cg_factory->generate(this->mtx); + auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r::value); } -TEST_F(Cg, SolvesMultipleStencilSystems) +TYPED_TEST(Cg, SolvesMultipleStencilSystems) { - auto solver = cg_factory->generate(mtx); - auto b = gko::initialize({{-1.0, 1.0}, {3.0, 0.0}, {1.0, 1.0}}, exec); - auto x = gko::initialize({{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto solver = this->cg_factory->generate(this->mtx); + auto b = gko::initialize( + {I{-1.0, 1.0}, I{3.0, 0.0}, I{1.0, 1.0}}, this->exec); + auto x = gko::initialize( + {I{0.0, 0.0}, I{0.0, 0.0}, I{0.0, 0.0}}, this->exec); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({{1.0, 1.0}, {3.0, 1.0}, {2.0, 1.0}}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({{1.0, 1.0}, {3.0, 1.0}, {2.0, 1.0}}), + r::value); } -TEST_F(Cg, SolvesStencilSystemUsingAdvancedApply) +TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApply) { - auto solver = cg_factory->generate(mtx); - auto alpha = gko::initialize({2.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - auto b = gko::initialize({-1.0, 3.0, 1.0}, exec); - auto x = gko::initialize({0.5, 1.0, 2.0}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->cg_factory->generate(this->mtx); 
+ auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); + auto x = gko::initialize({0.5, 1.0, 2.0}, this->exec); solver->apply(alpha.get(), b.get(), beta.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({1.5, 5.0, 2.0}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({1.5, 5.0, 2.0}), r::value); } -TEST_F(Cg, SolvesMultipleStencilSystemsUsingAdvancedApply) +TYPED_TEST(Cg, SolvesMultipleStencilSystemsUsingAdvancedApply) { - auto solver = cg_factory->generate(mtx); - auto alpha = gko::initialize({2.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - auto b = gko::initialize({{-1.0, 1.0}, {3.0, 0.0}, {1.0, 1.0}}, exec); - auto x = gko::initialize({{0.5, 1.0}, {1.0, 2.0}, {2.0, 3.0}}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto solver = this->cg_factory->generate(this->mtx); + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto b = gko::initialize( + {I{-1.0, 1.0}, I{3.0, 0.0}, I{1.0, 1.0}}, this->exec); + auto x = gko::initialize( + {I{0.5, 1.0}, I{1.0, 2.0}, I{2.0, 3.0}}, this->exec); solver->apply(alpha.get(), b.get(), beta.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({{1.5, 1.0}, {5.0, 0.0}, {2.0, -1.0}}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({{1.5, 1.0}, {5.0, 0.0}, {2.0, -1.0}}), + r::value * 1e1); } -TEST_F(Cg, SolvesBigDenseSystem1) +TYPED_TEST(Cg, SolvesBigDenseSystem1) { - auto solver = cg_factory_big->generate(mtx_big); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->cg_factory_big->generate(this->mtx_big); auto b = gko::initialize( - {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, exec); - auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec); + {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, + this->exec); + auto x 
= gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({81.0, 55.0, 45.0, 5.0, 85.0, -10.0}), 1e-10); + GKO_ASSERT_MTX_NEAR(x, l({81.0, 55.0, 45.0, 5.0, 85.0, -10.0}), + r::value * 1e2); } -TEST_F(Cg, SolvesBigDenseSystem2) +TYPED_TEST(Cg, SolvesBigDenseSystem2) { - auto solver = cg_factory_big->generate(mtx_big); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->cg_factory_big->generate(this->mtx_big); auto b = gko::initialize( - {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, exec); - auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec); + {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, + this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({33.0, -56.0, 81.0, -30.0, 21.0, 40.0}), 1e-10); + GKO_ASSERT_MTX_NEAR(x, l({33.0, -56.0, 81.0, -30.0, 21.0, 40.0}), + r::value * 1e2); } -double infNorm(gko::matrix::Dense<> *mat, size_t col = 0) +template +gko::remove_complex infNorm(gko::matrix::Dense *mat, size_t col = 0) { using std::abs; - double norm = 0.0; + using no_cpx_t = gko::remove_complex; + no_cpx_t norm = 0.0; for (size_t i = 0; i < mat->get_size()[0]; ++i) { - double absEntry = abs(mat->at(i, col)); + no_cpx_t absEntry = abs(mat->at(i, col)); if (norm < absEntry) norm = absEntry; } return norm; } -TEST_F(Cg, SolvesMultipleDenseSystemForDivergenceCheck) +TYPED_TEST(Cg, SolvesMultipleDenseSystemForDivergenceCheck) { - auto solver = cg_factory_big->generate(mtx_big); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->cg_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( - {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, exec); + {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, + 
this->exec); auto b2 = gko::initialize( - {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, exec); + {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, + this->exec); - auto x1 = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec); - auto x2 = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec); + auto x1 = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); + auto x2 = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); - auto bc = Mtx::create(exec, gko::dim<2>{mtx_big->get_size()[0], 2}); - auto xc = Mtx::create(exec, gko::dim<2>{mtx_big->get_size()[1], 2}); + auto bc = + Mtx::create(this->exec, gko::dim<2>{this->mtx_big->get_size()[0], 2}); + auto xc = + Mtx::create(this->exec, gko::dim<2>{this->mtx_big->get_size()[1], 2}); for (size_t i = 0; i < bc->get_size()[0]; ++i) { bc->at(i, 0) = b1->at(i); bc->at(i, 1) = b2->at(i); @@ -208,41 +247,75 @@ TEST_F(Cg, SolvesMultipleDenseSystemForDivergenceCheck) solver->apply(b1.get(), x1.get()); solver->apply(b2.get(), x2.get()); solver->apply(bc.get(), xc.get()); - auto mergedRes = Mtx::create(exec, gko::dim<2>{b1->get_size()[0], 2}); + auto mergedRes = Mtx::create(this->exec, gko::dim<2>{b1->get_size()[0], 2}); for (size_t i = 0; i < mergedRes->get_size()[0]; ++i) { mergedRes->at(i, 0) = x1->at(i); mergedRes->at(i, 1) = x2->at(i); } - auto alpha = gko::initialize({1.0}, exec); - auto beta = gko::initialize({-1.0}, exec); + auto alpha = gko::initialize({1.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); - auto residual1 = Mtx::create(exec, b1->get_size()); + auto residual1 = Mtx::create(this->exec, b1->get_size()); residual1->copy_from(b1.get()); - auto residual2 = Mtx::create(exec, b2->get_size()); + auto residual2 = Mtx::create(this->exec, b2->get_size()); residual2->copy_from(b2.get()); - auto residualC = Mtx::create(exec, bc->get_size()); + auto residualC = Mtx::create(this->exec, bc->get_size()); residualC->copy_from(bc.get()); - 
mtx_big->apply(alpha.get(), x1.get(), beta.get(), residual1.get()); - mtx_big->apply(alpha.get(), x2.get(), beta.get(), residual2.get()); - mtx_big->apply(alpha.get(), xc.get(), beta.get(), residualC.get()); + this->mtx_big->apply(alpha.get(), x1.get(), beta.get(), residual1.get()); + this->mtx_big->apply(alpha.get(), x2.get(), beta.get(), residual2.get()); + this->mtx_big->apply(alpha.get(), xc.get(), beta.get(), residualC.get()); - double normS1 = infNorm(residual1.get()); - double normS2 = infNorm(residual2.get()); - double normC1 = infNorm(residualC.get(), 0); - double normC2 = infNorm(residualC.get(), 1); - double normB1 = infNorm(b1.get()); - double normB2 = infNorm(b2.get()); + auto normS1 = infNorm(residual1.get()); + auto normS2 = infNorm(residual2.get()); + auto normC1 = infNorm(residualC.get(), 0); + auto normC2 = infNorm(residualC.get(), 1); + auto normB1 = infNorm(b1.get()); + auto normB2 = infNorm(b2.get()); // make sure that all combined solutions are as good or better than the // single solutions - ASSERT_LE(normC1 / normB1, normS1 / normB1 + 1e-14); - ASSERT_LE(normC2 / normB2, normS2 / normB2 + 1e-14); + ASSERT_LE(normC1 / normB1, normS1 / normB1 + r::value); + ASSERT_LE(normC2 / normB2, normS2 / normB2 + r::value); // Not sure if this is necessary, the assertions above should cover what is // needed. 
- GKO_ASSERT_MTX_NEAR(xc, mergedRes, 1e-14); + GKO_ASSERT_MTX_NEAR(xc, mergedRes, r::value); +} + + +TYPED_TEST(Cg, SolvesTransposedBigDenseSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->cg_factory_big->generate(this->mtx_big); + auto b = gko::initialize( + {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, + this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); + + solver->transpose()->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({81.0, 55.0, 45.0, 5.0, 85.0, -10.0}), + r::value * 1e2); +} + + +TYPED_TEST(Cg, SolvesConjTransposedBigDenseSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->cg_factory_big->generate(this->mtx_big); + auto b = gko::initialize( + {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, + this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); + + solver->conj_transpose()->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({81.0, 55.0, 45.0, 5.0, 85.0, -10.0}), + r::value * 1e2); } diff --git a/reference/test/solver/cgs_kernels.cpp b/reference/test/solver/cgs_kernels.cpp index 277d73d6d20..8a15055bea9 100644 --- a/reference/test/solver/cgs_kernels.cpp +++ b/reference/test/solver/cgs_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -36,23 +36,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include #include #include #include #include #include -#include +#include #include +#include "core/test/utils.hpp" + + namespace { +template class Cgs : public ::testing::Test { protected: - using Mtx = gko::matrix::Dense<>; - using Solver = gko::solver::Cgs<>; + using value_type = T; + using Mtx = gko::matrix::Dense; + using Solver = gko::solver::Cgs; Cgs() : exec(gko::ReferenceExecutor::create()), @@ -63,8 +67,8 @@ class Cgs : public ::testing::Test { .with_criteria( gko::stop::Iteration::build().with_max_iters(40u).on( exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-15) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) .on(exec)) .on(exec)), mtx_big( @@ -76,12 +80,12 @@ class Cgs : public ::testing::Test { {69.0, 32.0, -68.0, 57.0, -30.0, -51.0}}, exec)), cgs_factory_big( - gko::solver::Cgs<>::build() + Solver::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(100u).on( exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-15) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) .on(exec)) .on(exec)) {} @@ -89,116 +93,148 @@ class Cgs : public ::testing::Test { std::shared_ptr exec; std::shared_ptr mtx; std::shared_ptr mtx_big; - std::unique_ptr::Factory> cgs_factory; - std::unique_ptr::Factory> cgs_factory_big; + std::unique_ptr cgs_factory; + std::unique_ptr cgs_factory_big; }; +TYPED_TEST_CASE(Cgs, gko::test::ValueTypes); -TEST_F(Cgs, SolvesDenseSystem) + +TYPED_TEST(Cgs, SolvesDenseSystem) { - auto solver = cgs_factory->generate(mtx); - auto b = gko::initialize({-1.0, 3.0, 1.0}, exec); - auto x = gko::initialize({0.0, 0.0, 0.0}, exec); + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto half_tol = std::sqrt(r::value); + auto solver = this->cgs_factory->generate(this->mtx); + auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, 
this->exec); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({-4.0, -1.0, 4.0}), 1e-8); + GKO_ASSERT_MTX_NEAR(x, l({-4.0, -1.0, 4.0}), half_tol); } -TEST_F(Cgs, SolvesMultipleDenseSystem) +TYPED_TEST(Cgs, SolvesMultipleDenseSystem) { - auto solver = cgs_factory->generate(mtx); - auto b = - gko::initialize({{-1.0, -5.0}, {3.0, 1.0}, {1.0, -2.0}}, exec); - auto x = gko::initialize({{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto half_tol = std::sqrt(r::value); + auto solver = this->cgs_factory->generate(this->mtx); + auto b = gko::initialize( + {I{-1.0, -5.0}, I{3.0, 1.0}, I{1.0, -2.0}}, this->exec); + auto x = gko::initialize( + {I{0.0, 0.0}, I{0.0, 0.0}, I{0.0, 0.0}}, this->exec); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({{-4.0, 1.0}, {-1.0, 2.0}, {4.0, -1.0}}), 1e-8); + GKO_ASSERT_MTX_NEAR(x, l({{-4.0, 1.0}, {-1.0, 2.0}, {4.0, -1.0}}), + half_tol); } -TEST_F(Cgs, SolvesDenseSystemUsingAdvancedApply) +TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApply) { - auto solver = cgs_factory->generate(mtx); - auto alpha = gko::initialize({2.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - auto b = gko::initialize({-1.0, 3.0, 1.0}, exec); - auto x = gko::initialize({0.5, 1.0, 2.0}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto half_tol = std::sqrt(r::value); + auto solver = this->cgs_factory->generate(this->mtx); + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); + auto x = gko::initialize({0.5, 1.0, 2.0}, this->exec); solver->apply(alpha.get(), b.get(), beta.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), 1e-8); + GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), half_tol); } -TEST_F(Cgs, SolvesMultipleDenseSystemsUsingAdvancedApply) 
+TYPED_TEST(Cgs, SolvesMultipleDenseSystemsUsingAdvancedApply) { - auto solver = cgs_factory->generate(mtx); - auto alpha = gko::initialize({2.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - auto b = - gko::initialize({{-1.0, -5.0}, {3.0, 1.0}, {1.0, -2.0}}, exec); - auto x = gko::initialize({{0.5, 1.0}, {1.0, 2.0}, {2.0, 3.0}}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto half_tol = std::sqrt(r::value); + auto solver = this->cgs_factory->generate(this->mtx); + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto b = gko::initialize( + {I{-1.0, -5.0}, I{3.0, 1.0}, I{1.0, -2.0}}, this->exec); + auto x = gko::initialize( + {I{0.5, 1.0}, I{1.0, 2.0}, I{2.0, 3.0}}, this->exec); solver->apply(alpha.get(), b.get(), beta.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({{-8.5, 1.0}, {-3.0, 2.0}, {6.0, -5.0}}), 1e-8); + GKO_ASSERT_MTX_NEAR(x, l({{-8.5, 1.0}, {-3.0, 2.0}, {6.0, -5.0}}), + half_tol); } -TEST_F(Cgs, SolvesBigDenseSystem1) +TYPED_TEST(Cgs, SolvesBigDenseSystem1) { - auto solver = cgs_factory_big->generate(mtx_big); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->cgs_factory_big->generate(this->mtx_big); auto b = gko::initialize( - {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, exec); - auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec); + {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({-13.0, -49.0, 69.0, -33.0, -82.0, -39.0}), 1e-10); + GKO_ASSERT_MTX_NEAR(x, l({-13.0, -49.0, 69.0, -33.0, -82.0, -39.0}), + r::value * 1e3); } -TEST_F(Cgs, SolvesBigDenseSystem2) +TYPED_TEST(Cgs, SolvesBigDenseSystem2) { - auto solver = cgs_factory_big->generate(mtx_big); + using Mtx = typename 
TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->cgs_factory_big->generate(this->mtx_big); auto b = gko::initialize( - {17356.0, 5466.0, 748.0, -456.0, 3434.0, -7020.0}, exec); - auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec); + {17356.0, 5466.0, 748.0, -456.0, 3434.0, -7020.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({-58.0, 98.0, -16.0, -58.0, 2.0, 76.0}), 1e-10); + GKO_ASSERT_MTX_NEAR(x, l({-58.0, 98.0, -16.0, -58.0, 2.0, 76.0}), + r::value * 1e2); } -double infNorm(gko::matrix::Dense<> *mat, size_t col = 0) +template +gko::remove_complex infNorm(gko::matrix::Dense *mat, size_t col = 0) { using std::abs; - double norm = 0.0; + using no_cpx_t = gko::remove_complex; + no_cpx_t norm = 0.0; for (size_t i = 0; i < mat->get_size()[0]; ++i) { - double absEntry = abs(mat->at(i, col)); + no_cpx_t absEntry = abs(mat->at(i, col)); if (norm < absEntry) norm = absEntry; } return norm; } -TEST_F(Cgs, SolvesMultipleDenseSystems) +TYPED_TEST(Cgs, SolvesMultipleDenseSystems) { - auto solver = cgs_factory_big->generate(mtx_big); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->cgs_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( - {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, exec); + {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec); auto b2 = gko::initialize( - {17356.0, 5466.0, 748.0, -456.0, 3434.0, -7020.0}, exec); + {17356.0, 5466.0, 748.0, -456.0, 3434.0, -7020.0}, this->exec); - auto x1 = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec); - auto x2 = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec); + auto x1 = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); + auto x2 = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); - auto bc = Mtx::create(exec, 
gko::dim<2>{mtx_big->get_size()[0], 2}); - auto xc = Mtx::create(exec, gko::dim<2>{mtx_big->get_size()[1], 2}); + auto bc = + Mtx::create(this->exec, gko::dim<2>{this->mtx_big->get_size()[0], 2}); + auto xc = + Mtx::create(this->exec, gko::dim<2>{this->mtx_big->get_size()[1], 2}); for (size_t i = 0; i < bc->get_size()[0]; ++i) { bc->at(i, 0) = b1->at(i); bc->at(i, 1) = b2->at(i); @@ -210,41 +246,74 @@ TEST_F(Cgs, SolvesMultipleDenseSystems) solver->apply(b1.get(), x1.get()); solver->apply(b2.get(), x2.get()); solver->apply(bc.get(), xc.get()); - auto mergedRes = Mtx::create(exec, gko::dim<2>{b1->get_size()[0], 2}); + auto mergedRes = Mtx::create(this->exec, gko::dim<2>{b1->get_size()[0], 2}); for (size_t i = 0; i < mergedRes->get_size()[0]; ++i) { mergedRes->at(i, 0) = x1->at(i); mergedRes->at(i, 1) = x2->at(i); } - auto alpha = gko::initialize({1.0}, exec); - auto beta = gko::initialize({-1.0}, exec); + auto alpha = gko::initialize({1.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); - auto residual1 = Mtx::create(exec, b1->get_size()); + auto residual1 = Mtx::create(this->exec, b1->get_size()); residual1->copy_from(b1.get()); - auto residual2 = Mtx::create(exec, b2->get_size()); + auto residual2 = Mtx::create(this->exec, b2->get_size()); residual2->copy_from(b2.get()); - auto residualC = Mtx::create(exec, bc->get_size()); + auto residualC = Mtx::create(this->exec, bc->get_size()); residualC->copy_from(bc.get()); - mtx_big->apply(alpha.get(), x1.get(), beta.get(), residual1.get()); - mtx_big->apply(alpha.get(), x2.get(), beta.get(), residual2.get()); - mtx_big->apply(alpha.get(), xc.get(), beta.get(), residualC.get()); + this->mtx_big->apply(alpha.get(), x1.get(), beta.get(), residual1.get()); + this->mtx_big->apply(alpha.get(), x2.get(), beta.get(), residual2.get()); + this->mtx_big->apply(alpha.get(), xc.get(), beta.get(), residualC.get()); - double normS1 = infNorm(residual1.get()); - double normS2 = infNorm(residual2.get()); - double normC1 = 
infNorm(residualC.get(), 0); - double normC2 = infNorm(residualC.get(), 1); - double normB1 = infNorm(b1.get()); - double normB2 = infNorm(b2.get()); + auto normS1 = infNorm(residual1.get()); + auto normS2 = infNorm(residual2.get()); + auto normC1 = infNorm(residualC.get(), 0); + auto normC2 = infNorm(residualC.get(), 1); + auto normB1 = infNorm(b1.get()); + auto normB2 = infNorm(b2.get()); // make sure that all combined solutions are as good or better than the // single solutions - ASSERT_LE(normC1 / normB1, normS1 / normB1 + 1e-14); - ASSERT_LE(normC2 / normB2, normS2 / normB2 + 1e-14); + ASSERT_LE(normC1 / normB1, normS1 / normB1 + r::value); + ASSERT_LE(normC2 / normB2, normS2 / normB2 + r::value); // Not sure if this is necessary, the assertions above should cover what is // needed. - GKO_ASSERT_MTX_NEAR(xc, mergedRes, 1e-14); + GKO_ASSERT_MTX_NEAR(xc, mergedRes, r::value); +} + + +TYPED_TEST(Cgs, SolvesTransposedBigDenseSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->cgs_factory_big->generate(this->mtx_big->transpose()); + auto b = gko::initialize( + {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); + + solver->transpose()->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({-13.0, -49.0, 69.0, -33.0, -82.0, -39.0}), + r::value * 1e3); +} + + +TYPED_TEST(Cgs, SolvesConjTransposedBigDenseSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = + this->cgs_factory_big->generate(this->mtx_big->conj_transpose()); + auto b = gko::initialize( + {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); + + solver->conj_transpose()->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({-13.0, -49.0, 69.0, -33.0, -82.0, -39.0}), + r::value * 1e3); } diff 
--git a/reference/test/solver/fcg_kernels.cpp b/reference/test/solver/fcg_kernels.cpp index 18c3fd69ff5..843ea5a6037 100644 --- a/reference/test/solver/fcg_kernels.cpp +++ b/reference/test/solver/fcg_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,25 +30,33 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ +#include + + #include -#include + + #include #include #include -#include #include #include -#include +#include #include +#include "core/test/utils.hpp" + + namespace { +template class Fcg : public ::testing::Test { protected: - using Mtx = gko::matrix::Dense<>; - using Solver = gko::solver::Fcg<>; + using value_type = T; + using Mtx = gko::matrix::Dense; + using Solver = gko::solver::Fcg; Fcg() : exec(gko::ReferenceExecutor::create()), @@ -61,8 +69,8 @@ class Fcg : public ::testing::Test { gko::stop::Time::build() .with_time_limit(std::chrono::seconds(6)) .on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-15) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) .on(exec)) .on(exec)), mtx_big(gko::initialize( @@ -74,12 +82,12 @@ class Fcg : public ::testing::Test { {5856.0, 3919.5, 3836.5, -132.0, 4373.5, 5678.0}}, exec)), fcg_factory_big( - gko::solver::Fcg<>::build() + Solver::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(100u).on( exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-15) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) .on(exec)) .on(exec)) {} @@ -87,113 +95,148 @@ class Fcg : public ::testing::Test { std::shared_ptr exec; 
std::shared_ptr mtx; std::shared_ptr mtx_big; - std::unique_ptr::Factory> fcg_factory; - std::unique_ptr::Factory> fcg_factory_big; + std::unique_ptr fcg_factory; + std::unique_ptr fcg_factory_big; }; +TYPED_TEST_CASE(Fcg, gko::test::ValueTypes); -TEST_F(Fcg, SolvesStencilSystem) + +TYPED_TEST(Fcg, SolvesStencilSystem) { - auto solver = fcg_factory->generate(mtx); - auto b = gko::initialize({-1.0, 3.0, 1.0}, exec); - auto x = gko::initialize({0.0, 0.0, 0.0}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->fcg_factory->generate(this->mtx); + auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r::value); } -TEST_F(Fcg, SolvesMultipleStencilSystems) +TYPED_TEST(Fcg, SolvesMultipleStencilSystems) { - auto solver = fcg_factory->generate(mtx); - auto b = gko::initialize({{-1.0, 1.0}, {3.0, 0.0}, {1.0, 1.0}}, exec); - auto x = gko::initialize({{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto solver = this->fcg_factory->generate(this->mtx); + auto b = gko::initialize( + {I{-1.0, 1.0}, I{3.0, 0.0}, I{1.0, 1.0}}, this->exec); + auto x = gko::initialize( + {I{0.0, 0.0}, I{0.0, 0.0}, I{0.0, 0.0}}, this->exec); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({{1.0, 1.0}, {3.0, 1.0}, {2.0, 1.0}}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({{1.0, 1.0}, {3.0, 1.0}, {2.0, 1.0}}), + r::value); } -TEST_F(Fcg, SolvesStencilSystemUsingAdvancedApply) +TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApply) { - auto solver = fcg_factory->generate(mtx); - auto alpha = gko::initialize({2.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - auto b = gko::initialize({-1.0, 3.0, 1.0}, exec); - auto x 
= gko::initialize({0.5, 1.0, 2.0}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->fcg_factory->generate(this->mtx); + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); + auto x = gko::initialize({0.5, 1.0, 2.0}, this->exec); solver->apply(alpha.get(), b.get(), beta.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({1.5, 5.0, 2.0}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({1.5, 5.0, 2.0}), r::value * 1e1); } -TEST_F(Fcg, SolvesMultipleStencilSystemsUsingAdvancedApply) +TYPED_TEST(Fcg, SolvesMultipleStencilSystemsUsingAdvancedApply) { - auto solver = fcg_factory->generate(mtx); - auto alpha = gko::initialize({2.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - auto b = gko::initialize({{-1.0, 1.0}, {3.0, 0.0}, {1.0, 1.0}}, exec); - auto x = gko::initialize({{0.5, 1.0}, {1.0, 2.0}, {2.0, 3.0}}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto solver = this->fcg_factory->generate(this->mtx); + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto b = gko::initialize( + {I{-1.0, 1.0}, I{3.0, 0.0}, I{1.0, 1.0}}, this->exec); + auto x = gko::initialize( + {I{0.5, 1.0}, I{1.0, 2.0}, I{2.0, 3.0}}, this->exec); solver->apply(alpha.get(), b.get(), beta.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({{1.5, 1.0}, {5.0, 0.0}, {2.0, -1.0}}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({{1.5, 1.0}, {5.0, 0.0}, {2.0, -1.0}}), + r::value * 1e1); } -TEST_F(Fcg, SolvesBigDenseSystem1) + +TYPED_TEST(Fcg, SolvesBigDenseSystem1) { - auto solver = fcg_factory_big->generate(mtx_big); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->fcg_factory_big->generate(this->mtx_big); auto b = gko::initialize( - {1300083.0, 
1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, exec); - auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec); + {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, + this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({81.0, 55.0, 45.0, 5.0, 85.0, -10.0}), 1e-10); + GKO_ASSERT_MTX_NEAR(x, l({81.0, 55.0, 45.0, 5.0, 85.0, -10.0}), + r::value * 1e3); } -TEST_F(Fcg, SolvesBigDenseSystem2) +TYPED_TEST(Fcg, SolvesBigDenseSystem2) { - auto solver = fcg_factory_big->generate(mtx_big); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->fcg_factory_big->generate(this->mtx_big); auto b = gko::initialize( - {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, exec); - auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec); + {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, + this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({33.0, -56.0, 81.0, -30.0, 21.0, 40.0}), 1e-10); + GKO_ASSERT_MTX_NEAR(x, l({33.0, -56.0, 81.0, -30.0, 21.0, 40.0}), + r::value * 1e3); } -double infNorm(gko::matrix::Dense<> *mat, size_t col = 0) +template +gko::remove_complex infNorm(gko::matrix::Dense *mat, size_t col = 0) { using std::abs; - double norm = 0.0; + using no_cpx_t = gko::remove_complex; + no_cpx_t norm = 0.0; for (size_t i = 0; i < mat->get_size()[0]; ++i) { - double absEntry = abs(mat->at(i, col)); + no_cpx_t absEntry = abs(mat->at(i, col)); if (norm < absEntry) norm = absEntry; } return norm; } -TEST_F(Fcg, SolvesMultipleBigDenseSystems) +TYPED_TEST(Fcg, SolvesMultipleBigDenseSystems) { - auto solver = fcg_factory_big->generate(mtx_big); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = 
this->fcg_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( - {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, exec); + {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, + this->exec); auto b2 = gko::initialize( - {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, exec); + {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, + this->exec); - auto x1 = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec); - auto x2 = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec); + auto x1 = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); + auto x2 = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); - auto bc = Mtx::create(exec, gko::dim<2>{mtx_big->get_size()[0], 2}); - auto xc = Mtx::create(exec, gko::dim<2>{mtx_big->get_size()[1], 2}); + auto bc = + Mtx::create(this->exec, gko::dim<2>{this->mtx_big->get_size()[0], 2}); + auto xc = + Mtx::create(this->exec, gko::dim<2>{this->mtx_big->get_size()[1], 2}); for (size_t i = 0; i < bc->get_size()[0]; ++i) { bc->at(i, 0) = b1->at(i); bc->at(i, 1) = b2->at(i); @@ -205,25 +248,25 @@ TEST_F(Fcg, SolvesMultipleBigDenseSystems) solver->apply(b1.get(), x1.get()); solver->apply(b2.get(), x2.get()); solver->apply(bc.get(), xc.get()); - auto mergedRes = Mtx::create(exec, gko::dim<2>{b1->get_size()[0], 2}); + auto mergedRes = Mtx::create(this->exec, gko::dim<2>{b1->get_size()[0], 2}); for (size_t i = 0; i < mergedRes->get_size()[0]; ++i) { mergedRes->at(i, 0) = x1->at(i); mergedRes->at(i, 1) = x2->at(i); } - auto alpha = gko::initialize({1.0}, exec); - auto beta = gko::initialize({-1.0}, exec); + auto alpha = gko::initialize({1.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); - auto residual1 = Mtx::create(exec, b1->get_size()); + auto residual1 = Mtx::create(this->exec, b1->get_size()); residual1->copy_from(b1.get()); - auto residual2 = Mtx::create(exec, b2->get_size()); + auto residual2 = Mtx::create(this->exec, 
b2->get_size()); residual2->copy_from(b2.get()); - auto residualC = Mtx::create(exec, bc->get_size()); + auto residualC = Mtx::create(this->exec, bc->get_size()); residualC->copy_from(bc.get()); - mtx_big->apply(alpha.get(), x1.get(), beta.get(), residual1.get()); - mtx_big->apply(alpha.get(), x2.get(), beta.get(), residual2.get()); - mtx_big->apply(alpha.get(), xc.get(), beta.get(), residualC.get()); + this->mtx_big->apply(alpha.get(), x1.get(), beta.get(), residual1.get()); + this->mtx_big->apply(alpha.get(), x2.get(), beta.get(), residual2.get()); + this->mtx_big->apply(alpha.get(), xc.get(), beta.get(), residualC.get()); double normS1 = infNorm(residual1.get()); double normS2 = infNorm(residual2.get()); @@ -234,12 +277,46 @@ TEST_F(Fcg, SolvesMultipleBigDenseSystems) // make sure that all combined solutions are as good or better than the // single solutions - ASSERT_LE(normC1 / normB1, normS1 / normB1 + 1e-14); - ASSERT_LE(normC2 / normB2, normS2 / normB2 + 1e-14); + ASSERT_LE(normC1 / normB1, normS1 / normB1 + r::value); + ASSERT_LE(normC2 / normB2, normS2 / normB2 + r::value); // Not sure if this is necessary, the assertions above should cover what is // needed. 
- GKO_ASSERT_MTX_NEAR(xc, mergedRes, 1e-14); + GKO_ASSERT_MTX_NEAR(xc, mergedRes, r::value); +} + + +TYPED_TEST(Fcg, SolvesTransposedBigDenseSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->fcg_factory_big->generate(this->mtx_big); + auto b = gko::initialize( + {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, + this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); + + solver->transpose()->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({81.0, 55.0, 45.0, 5.0, 85.0, -10.0}), + r::value * 1e3); +} + + +TYPED_TEST(Fcg, SolvesConjTransposedBigDenseSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->fcg_factory_big->generate(this->mtx_big); + auto b = gko::initialize( + {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, + this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); + + solver->conj_transpose()->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({81.0, 55.0, 45.0, 5.0, 85.0, -10.0}), + r::value * 1e3); } diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp index 13d28b641bf..8eb06cf8dce 100644 --- a/reference/test/solver/gmres_kernels.cpp +++ b/reference/test/solver/gmres_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -36,36 +36,42 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include #include #include +#include #include #include #include #include -#include +#include #include +#include "core/test/utils.hpp" + + namespace { +template class Gmres : public ::testing::Test { protected: - using Mtx = gko::matrix::Dense<>; + using value_type = T; + using Mtx = gko::matrix::Dense; + using Solver = gko::solver::Gmres; Gmres() : exec(gko::ReferenceExecutor::create()), mtx(gko::initialize( {{1.0, 2.0, 3.0}, {3.0, 2.0, -1.0}, {0.0, -1.0, 2}}, exec)), gmres_factory( - gko::solver::Gmres<>::build() + Solver::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(4u).on(exec), gko::stop::Time::build() .with_time_limit(std::chrono::seconds(6)) .on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-15) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) .on(exec)) .on(exec)), mtx_big(gko::initialize( @@ -77,12 +83,12 @@ class Gmres : public ::testing::Test { {-848.1, -280.5, -381.8, -187.1, 51.2, -176.2}}, exec)), gmres_factory_big( - gko::solver::Gmres<>::build() + Solver::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(100u).on( exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-15) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) .on(exec)) .on(exec)), mtx_medium( @@ -98,115 +104,148 @@ class Gmres : public ::testing::Test { std::shared_ptr mtx; std::shared_ptr mtx_medium; std::shared_ptr mtx_big; - std::unique_ptr::Factory> gmres_factory; - std::unique_ptr::Factory> gmres_factory_big; + std::unique_ptr gmres_factory; + std::unique_ptr gmres_factory_big; }; +TYPED_TEST_CASE(Gmres, gko::test::ValueTypes); + -TEST_F(Gmres, SolvesStencilSystem) +TYPED_TEST(Gmres, SolvesStencilSystem) { - auto solver = gmres_factory->generate(mtx); - auto b = gko::initialize({13.0, 7.0, 1.0}, exec); - auto x = gko::initialize({0.0, 0.0, 0.0}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = 
typename TestFixture::value_type; + auto solver = this->gmres_factory->generate(this->mtx); + auto b = gko::initialize({13.0, 7.0, 1.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r::value * 1e1); } -TEST_F(Gmres, SolvesMultipleStencilSystems) +TYPED_TEST(Gmres, SolvesMultipleStencilSystems) { - auto solver = gmres_factory->generate(mtx); - auto b = gko::initialize({{13.0, 6.0}, {7.0, 4.0}, {1.0, 1.0}}, exec); - auto x = gko::initialize({{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto solver = this->gmres_factory->generate(this->mtx); + auto b = gko::initialize( + {I{13.0, 6.0}, I{7.0, 4.0}, I{1.0, 1.0}}, this->exec); + auto x = gko::initialize( + {I{0.0, 0.0}, I{0.0, 0.0}, I{0.0, 0.0}}, this->exec); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({{1.0, 1.0}, {3.0, 1.0}, {2.0, 1.0}}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({{1.0, 1.0}, {3.0, 1.0}, {2.0, 1.0}}), + r::value * 1e1); } -TEST_F(Gmres, SolvesStencilSystemUsingAdvancedApply) +TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApply) { - auto solver = gmres_factory->generate(mtx); - auto alpha = gko::initialize({2.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - auto b = gko::initialize({13.0, 7.0, 1.0}, exec); - auto x = gko::initialize({0.5, 1.0, 2.0}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->gmres_factory->generate(this->mtx); + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto b = gko::initialize({13.0, 7.0, 1.0}, this->exec); + auto x = gko::initialize({0.5, 1.0, 2.0}, this->exec); solver->apply(alpha.get(), b.get(), beta.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({1.5, 
5.0, 2.0}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({1.5, 5.0, 2.0}), r::value * 1e1); } -TEST_F(Gmres, SolvesMultipleStencilSystemsUsingAdvancedApply) +TYPED_TEST(Gmres, SolvesMultipleStencilSystemsUsingAdvancedApply) { - auto solver = gmres_factory->generate(mtx); - auto alpha = gko::initialize({2.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - auto b = gko::initialize({{13.0, 6.0}, {7.0, 4.0}, {1.0, 1.0}}, exec); - auto x = gko::initialize({{0.5, 1.0}, {1.0, 2.0}, {2.0, 3.0}}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto solver = this->gmres_factory->generate(this->mtx); + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto b = gko::initialize( + {I{13.0, 6.0}, I{7.0, 4.0}, I{1.0, 1.0}}, this->exec); + auto x = gko::initialize( + {I{0.5, 1.0}, I{1.0, 2.0}, I{2.0, 3.0}}, this->exec); solver->apply(alpha.get(), b.get(), beta.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({{1.5, 1.0}, {5.0, 0.0}, {2.0, -1.0}}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({{1.5, 1.0}, {5.0, 0.0}, {2.0, -1.0}}), + r::value * 1e1); } -TEST_F(Gmres, SolvesBigDenseSystem1) +TYPED_TEST(Gmres, SolvesBigDenseSystem1) { - auto solver = gmres_factory_big->generate(mtx_big); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->gmres_factory_big->generate(this->mtx_big); auto b = gko::initialize( - {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15}, exec); - auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec); + {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15}, + this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({52.7, 85.4, 134.2, -250.0, -16.8, 35.3}), 1e-10); + GKO_ASSERT_MTX_NEAR(x, l({52.7, 85.4, 134.2, -250.0, -16.8, 35.3}), + r::value * 1e3); } 
-TEST_F(Gmres, SolvesBigDenseSystem2) +TYPED_TEST(Gmres, SolvesBigDenseSystem2) { - auto solver = gmres_factory_big->generate(mtx_big); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->gmres_factory_big->generate(this->mtx_big); auto b = gko::initialize( {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90}, - exec); - auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec); + this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({33.0, -56.0, 81.0, -30.0, 21.0, 40.0}), 1e-10); + GKO_ASSERT_MTX_NEAR(x, l({33.0, -56.0, 81.0, -30.0, 21.0, 40.0}), + r::value * 1e3); } -double infNorm(gko::matrix::Dense<> *mat, size_t col = 0) +template +gko::remove_complex infNorm(gko::matrix::Dense *mat, size_t col = 0) { using std::abs; - double norm = 0.0; + using no_cpx_t = gko::remove_complex; + no_cpx_t norm = 0.0; for (size_t i = 0; i < mat->get_size()[0]; ++i) { - double absEntry = abs(mat->at(i, col)); + no_cpx_t absEntry = abs(mat->at(i, col)); if (norm < absEntry) norm = absEntry; } return norm; } -TEST_F(Gmres, SolvesMultipleDenseSystemForDivergenceCheck) +TYPED_TEST(Gmres, SolvesMultipleDenseSystemForDivergenceCheck) { - auto solver = gmres_factory_big->generate(mtx_big); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->gmres_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( - {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, exec); + {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, + this->exec); auto b2 = gko::initialize( - {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, exec); + {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, + this->exec); - auto x1 = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec); - auto x2 = gko::initialize({0.0, 0.0, 0.0, 
0.0, 0.0, 0.0}, exec); + auto x1 = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); + auto x2 = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); - auto bc = Mtx::create(exec, gko::dim<2>{mtx_big->get_size()[0], 2}); - auto xc = Mtx::create(exec, gko::dim<2>{mtx_big->get_size()[1], 2}); + auto bc = + Mtx::create(this->exec, gko::dim<2>{this->mtx_big->get_size()[0], 2}); + auto xc = + Mtx::create(this->exec, gko::dim<2>{this->mtx_big->get_size()[1], 2}); for (size_t i = 0; i < bc->get_size()[0]; ++i) { bc->at(i, 0) = b1->at(i); bc->at(i, 1) = b2->at(i); @@ -218,88 +257,135 @@ TEST_F(Gmres, SolvesMultipleDenseSystemForDivergenceCheck) solver->apply(b1.get(), x1.get()); solver->apply(b2.get(), x2.get()); solver->apply(bc.get(), xc.get()); - auto mergedRes = Mtx::create(exec, gko::dim<2>{b1->get_size()[0], 2}); + auto mergedRes = Mtx::create(this->exec, gko::dim<2>{b1->get_size()[0], 2}); for (size_t i = 0; i < mergedRes->get_size()[0]; ++i) { mergedRes->at(i, 0) = x1->at(i); mergedRes->at(i, 1) = x2->at(i); } - auto alpha = gko::initialize({1.0}, exec); - auto beta = gko::initialize({-1.0}, exec); + auto alpha = gko::initialize({1.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); - auto residual1 = Mtx::create(exec, b1->get_size()); + auto residual1 = Mtx::create(this->exec, b1->get_size()); residual1->copy_from(b1.get()); - auto residual2 = Mtx::create(exec, b2->get_size()); + auto residual2 = Mtx::create(this->exec, b2->get_size()); residual2->copy_from(b2.get()); - auto residualC = Mtx::create(exec, bc->get_size()); + auto residualC = Mtx::create(this->exec, bc->get_size()); residualC->copy_from(bc.get()); - mtx_big->apply(alpha.get(), x1.get(), beta.get(), residual1.get()); - mtx_big->apply(alpha.get(), x2.get(), beta.get(), residual2.get()); - mtx_big->apply(alpha.get(), xc.get(), beta.get(), residualC.get()); + this->mtx_big->apply(alpha.get(), x1.get(), beta.get(), residual1.get()); + this->mtx_big->apply(alpha.get(), 
x2.get(), beta.get(), residual2.get()); + this->mtx_big->apply(alpha.get(), xc.get(), beta.get(), residualC.get()); - double normS1 = infNorm(residual1.get()); - double normS2 = infNorm(residual2.get()); - double normC1 = infNorm(residualC.get(), 0); - double normC2 = infNorm(residualC.get(), 1); - double normB1 = infNorm(b1.get()); - double normB2 = infNorm(b2.get()); + auto normS1 = infNorm(residual1.get()); + auto normS2 = infNorm(residual2.get()); + auto normC1 = infNorm(residualC.get(), 0); + auto normC2 = infNorm(residualC.get(), 1); + auto normB1 = infNorm(b1.get()); + auto normB2 = infNorm(b2.get()); // make sure that all combined solutions are as good or better than the // single solutions - ASSERT_LE(normC1 / normB1, normS1 / normB1 + 1e-14); - ASSERT_LE(normC2 / normB2, normS2 / normB2 + 1e-14); + ASSERT_LE(normC1 / normB1, normS1 / normB1 + r::value); + ASSERT_LE(normC2 / normB2, normS2 / normB2 + r::value); // Not sure if this is necessary, the assertions above should cover what is // needed. 
- GKO_ASSERT_MTX_NEAR(xc, mergedRes, 1e-14); + GKO_ASSERT_MTX_NEAR(xc, mergedRes, r::value); } -TEST_F(Gmres, SolvesBigDenseSystem1WithRestart) +TYPED_TEST(Gmres, SolvesBigDenseSystem1WithRestart) { + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + using value_type = typename TestFixture::value_type; + auto half_tol = std::sqrt(r::value); auto gmres_factory_restart = - gko::solver::Gmres<>::build() + Solver::build() .with_krylov_dim(4u) .with_criteria( - gko::stop::Iteration::build().with_max_iters(200u).on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-15) - .on(exec)) - .on(exec); - auto solver = gmres_factory_restart->generate(mtx_medium); + gko::stop::Iteration::build().with_max_iters(200u).on( + this->exec), + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) + .on(this->exec)) + .on(this->exec); + auto solver = gmres_factory_restart->generate(this->mtx_medium); auto b = gko::initialize( - {-13945.16, 11205.66, 16132.96, 24342.18, -10910.98}, exec); - auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0}, exec); + {-13945.16, 11205.66, 16132.96, 24342.18, -10910.98}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({-140.20, -142.20, 48.80, -17.70, -19.60}), 1e-5); + GKO_ASSERT_MTX_NEAR(x, l({-140.20, -142.20, 48.80, -17.70, -19.60}), + half_tol * 1e2); } -TEST_F(Gmres, SolvesWithPreconditioner) +TYPED_TEST(Gmres, SolvesWithPreconditioner) { + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + using value_type = typename TestFixture::value_type; auto gmres_factory_preconditioner = - gko::solver::Gmres<>::build() + Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on(exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-15) - .on(exec)) - 
.with_preconditioner(gko::preconditioner::Jacobi<>::build() - .with_max_block_size(3u) - .on(exec)) - .on(exec); - auto solver = gmres_factory_preconditioner->generate(mtx_big); + gko::stop::Iteration::build().with_max_iters(100u).on( + this->exec), + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) + .on(this->exec)) + .with_preconditioner( + gko::preconditioner::Jacobi::build() + .with_max_block_size(3u) + .on(this->exec)) + .on(this->exec); + auto solver = gmres_factory_preconditioner->generate(this->mtx_big); auto b = gko::initialize( {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90}, - exec); - auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, exec); + this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({33.0, -56.0, 81.0, -30.0, 21.0, 40.0}), 1e-10); + GKO_ASSERT_MTX_NEAR(x, l({33.0, -56.0, 81.0, -30.0, 21.0, 40.0}), + r::value * 1e3); +} + + +TYPED_TEST(Gmres, SolvesTransposedBigDenseSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->gmres_factory_big->generate(this->mtx_big->transpose()); + auto b = gko::initialize( + {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15}, + this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); + + solver->transpose()->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({52.7, 85.4, 134.2, -250.0, -16.8, 35.3}), + r::value * 1e3); +} + + +TYPED_TEST(Gmres, SolvesConjTransposedBigDenseSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = + this->gmres_factory_big->generate(this->mtx_big->conj_transpose()); + auto b = gko::initialize( + {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15}, + this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); + + 
solver->conj_transpose()->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({52.7, 85.4, 134.2, -250.0, -16.8, 35.3}), + r::value * 1e3); } diff --git a/reference/test/solver/ir_kernels.cpp b/reference/test/solver/ir_kernels.cpp index 18e3c3cf9b2..208d4f6b9d9 100644 --- a/reference/test/solver/ir_kernels.cpp +++ b/reference/test/solver/ir_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -36,93 +36,277 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include #include #include +#include #include #include -#include +#include + + +#include "core/test/utils.hpp" namespace { +template class Ir : public ::testing::Test { protected: - using Mtx = gko::matrix::Dense<>; + using value_type = T; + using Mtx = gko::matrix::Dense; + using Solver = gko::solver::Ir; Ir() : exec(gko::ReferenceExecutor::create()), mtx(gko::initialize( {{0.9, -1.0, 3.0}, {0.0, 1.0, 3.0}, {0.0, 0.0, 1.1}}, exec)), // Eigenvalues of mtx are 0.9, 1.0 and 1.1 - // Richardson iteration, converges since | lambda - 1 | < 1 + // Richardson iteration, converges since + // | relaxation_factor * lambda - 1 | < 1 ir_factory( - gko::solver::Ir<>::build() + Solver::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(30u).on( exec), - gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(1e-15) + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) .on(exec)) .on(exec)) {} std::shared_ptr exec; std::shared_ptr mtx; - std::unique_ptr::Factory> ir_factory; + std::unique_ptr ir_factory; }; +TYPED_TEST_CASE(Ir, gko::test::ValueTypes); + -TEST_F(Ir, SolvesTriangularSystem) +TYPED_TEST(Ir, SolvesTriangularSystem) { - auto solver = ir_factory->generate(mtx); - auto b = gko::initialize({3.9, 9.0, 2.2}, 
exec); - auto x = gko::initialize({0.0, 0.0, 0.0}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->ir_factory->generate(this->mtx); + auto b = gko::initialize({3.9, 9.0, 2.2}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r::value * 1e1); +} + + +TYPED_TEST(Ir, SolvesTriangularSystemWithIterativeInnerSolver) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + + const gko::remove_complex inner_reduction_factor = 1e-2; + auto inner_solver_factory = + gko::solver::Gmres::build() + .with_criteria(gko::stop::ResidualNormReduction::build() + .with_reduction_factor(inner_reduction_factor) + .on(this->exec)) + .on(this->exec); + + auto solver_factory = + gko::solver::Ir::build() + .with_criteria(gko::stop::Iteration::build().with_max_iters(30u).on( + this->exec), + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) + .on(this->exec)) + .with_solver(gko::share(inner_solver_factory)) + .on(this->exec); + auto b = gko::initialize({3.9, 9.0, 2.2}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + + solver_factory->generate(this->mtx)->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r::value * 1e1); } -TEST_F(Ir, SolvesMultipleTriangularSystems) +TYPED_TEST(Ir, SolvesMultipleTriangularSystems) { - auto solver = ir_factory->generate(mtx); - auto b = gko::initialize({{3.9, 2.9}, {9.0, 4.0}, {2.2, 1.1}}, exec); - auto x = gko::initialize({{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto solver = this->ir_factory->generate(this->mtx); + auto b = gko::initialize( + {I{3.9, 2.9}, I{9.0, 4.0}, I{2.2, 1.1}}, 
this->exec); + auto x = gko::initialize( + {I{0.0, 0.0}, I{0.0, 0.0}, I{0.0, 0.0}}, this->exec); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({{1.0, 1.0}, {3.0, 1.0}, {2.0, 1.0}}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({{1.0, 1.0}, {3.0, 1.0}, {2.0, 1.0}}), + r::value * 1e1); } -TEST_F(Ir, SolvesTriangularSystemUsingAdvancedApply) +TYPED_TEST(Ir, SolvesTriangularSystemUsingAdvancedApply) { - auto solver = ir_factory->generate(mtx); - auto alpha = gko::initialize({2.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - auto b = gko::initialize({3.9, 9.0, 2.2}, exec); - auto x = gko::initialize({0.5, 1.0, 2.0}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->ir_factory->generate(this->mtx); + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto b = gko::initialize({3.9, 9.0, 2.2}, this->exec); + auto x = gko::initialize({0.5, 1.0, 2.0}, this->exec); solver->apply(alpha.get(), b.get(), beta.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({1.5, 5.0, 2.0}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({1.5, 5.0, 2.0}), r::value); } -TEST_F(Ir, SolvesMultipleStencilSystemsUsingAdvancedApply) +TYPED_TEST(Ir, SolvesMultipleStencilSystemsUsingAdvancedApply) { - auto solver = ir_factory->generate(mtx); - auto alpha = gko::initialize({2.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - auto b = gko::initialize({{3.9, 2.9}, {9.0, 4.0}, {2.2, 1.1}}, exec); - auto x = gko::initialize({{0.5, 1.0}, {1.0, 2.0}, {2.0, 3.0}}, exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto solver = this->ir_factory->generate(this->mtx); + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto b = gko::initialize( + {I{3.9, 2.9}, I{9.0, 4.0}, I{2.2, 1.1}}, this->exec); + auto x = gko::initialize( + {I{0.5, 1.0}, I{1.0, 2.0}, 
I{2.0, 3.0}}, this->exec); solver->apply(alpha.get(), b.get(), beta.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({{1.5, 1.0}, {5.0, 0.0}, {2.0, -1.0}}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({{1.5, 1.0}, {5.0, 0.0}, {2.0, -1.0}}), + r::value * 1e1); +} + + +TYPED_TEST(Ir, SolvesTransposedTriangularSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->ir_factory->generate(this->mtx->transpose()); + auto b = gko::initialize({3.9, 9.0, 2.2}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + + solver->transpose()->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r::value * 1e1); +} + + +TYPED_TEST(Ir, SolvesConjTransposedTriangularSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = this->ir_factory->generate(this->mtx->conj_transpose()); + auto b = gko::initialize({3.9, 9.0, 2.2}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + + solver->conj_transpose()->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r::value * 1e1); +} + + +TYPED_TEST(Ir, RichardsonSolvesTriangularSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = gko::solver::Ir::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(100u).on( + this->exec), + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) + .on(this->exec)) + .with_relaxation_factor(value_type{0.9}) + .on(this->exec) + ->generate(this->mtx); + auto b = gko::initialize({3.9, 9.0, 2.2}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + + solver->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r::value * 1e1); +} + + +TYPED_TEST(Ir, RichardsonSolvesTriangularSystemWithIterativeInnerSolver) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = 
typename TestFixture::value_type; + const gko::remove_complex inner_reduction_factor = 1e-2; + auto inner_solver_factory = + gko::solver::Gmres::build() + .with_criteria(gko::stop::ResidualNormReduction::build() + .with_reduction_factor(inner_reduction_factor) + .on(this->exec)) + .on(this->exec); + auto solver_factory = + gko::solver::Ir::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(100u).on( + this->exec), + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) + .on(this->exec)) + .with_relaxation_factor(value_type{0.9}) + .with_solver(gko::share(inner_solver_factory)) + .on(this->exec); + auto b = gko::initialize({3.9, 9.0, 2.2}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + + solver_factory->generate(this->mtx)->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r::value * 1e1); +} + + +TYPED_TEST(Ir, RichardsonTransposedSolvesTriangularSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = + gko::solver::Ir::build() + .with_criteria(gko::stop::Iteration::build().with_max_iters(30u).on( + this->exec), + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) + .on(this->exec)) + .with_relaxation_factor(value_type{0.9}) + .on(this->exec) + ->generate(this->mtx->transpose()); + auto b = gko::initialize({3.9, 9.0, 2.2}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + + solver->transpose()->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r::value * 1e1); +} + + +TYPED_TEST(Ir, RichardsonConjTransposedSolvesTriangularSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto solver = + gko::solver::Ir::build() + .with_criteria(gko::stop::Iteration::build().with_max_iters(30u).on( + this->exec), + gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) + 
.on(this->exec)) + .with_relaxation_factor(value_type{0.9}) + .on(this->exec) + ->generate(this->mtx->conj_transpose()); + auto b = gko::initialize({3.9, 9.0, 2.2}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + + solver->conj_transpose()->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), r::value * 1e1); } diff --git a/reference/test/solver/lower_trs.cpp b/reference/test/solver/lower_trs.cpp index 465b576a107..65a7aab0d42 100644 --- a/reference/test/solver/lower_trs.cpp +++ b/reference/test/solver/lower_trs.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -44,17 +44,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include "core/test/utils/assertions.hpp" +#include "core/test/utils.hpp" namespace { +template class LowerTrs : public ::testing::Test { protected: - using Mtx = gko::matrix::Dense<>; - using CsrMtx = gko::matrix::Csr<>; - using Solver = gko::solver::LowerTrs<>; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Mtx = gko::matrix::Dense; + using CsrMtx = gko::matrix::Csr; + using Solver = gko::solver::LowerTrs; LowerTrs() : exec(gko::ReferenceExecutor::create()), @@ -69,64 +74,72 @@ class LowerTrs : public ::testing::Test { std::shared_ptr exec; std::shared_ptr mtx; std::shared_ptr csr_mtx; - std::unique_ptr lower_trs_factory; + std::unique_ptr lower_trs_factory; std::unique_ptr solver; }; +TYPED_TEST_CASE(LowerTrs, gko::test::ValueIndexTypes); -TEST_F(LowerTrs, LowerTrsFactoryCreatesCorrectSolver) + +TYPED_TEST(LowerTrs, LowerTrsFactoryCreatesCorrectSolver) { - auto sys_mtx = solver->get_system_matrix(); + auto sys_mtx = 
this->solver->get_system_matrix(); - ASSERT_EQ(solver->get_size(), gko::dim<2>(3, 3)); + ASSERT_EQ(this->solver->get_size(), gko::dim<2>(3, 3)); ASSERT_NE(sys_mtx, nullptr); - GKO_ASSERT_MTX_NEAR(sys_mtx, csr_mtx, 0); + GKO_ASSERT_MTX_NEAR(sys_mtx, this->csr_mtx, 0); } -TEST_F(LowerTrs, CanBeCopied) +TYPED_TEST(LowerTrs, CanBeCopied) { - auto copy = Solver::build().on(exec)->generate(Mtx::create(exec)); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto copy = + Solver::build().on(this->exec)->generate(Mtx::create(this->exec)); - copy->copy_from(gko::lend(solver)); + copy->copy_from(gko::lend(this->solver)); auto copy_mtx = copy->get_system_matrix(); ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3)); - GKO_ASSERT_MTX_NEAR(copy_mtx.get(), csr_mtx.get(), 0); + GKO_ASSERT_MTX_NEAR(copy_mtx.get(), this->csr_mtx.get(), 0); } -TEST_F(LowerTrs, CanBeMoved) +TYPED_TEST(LowerTrs, CanBeMoved) { - auto copy = Solver::build().on(exec)->generate(Mtx::create(exec)); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto copy = + Solver::build().on(this->exec)->generate(Mtx::create(this->exec)); - copy->copy_from(std::move(solver)); + copy->copy_from(std::move(this->solver)); auto copy_mtx = copy->get_system_matrix(); ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3)); - GKO_ASSERT_MTX_NEAR(copy_mtx.get(), csr_mtx.get(), 0); + GKO_ASSERT_MTX_NEAR(copy_mtx.get(), this->csr_mtx.get(), 0); } -TEST_F(LowerTrs, CanBeCloned) +TYPED_TEST(LowerTrs, CanBeCloned) { - auto clone = solver->clone(); + auto clone = this->solver->clone(); auto clone_mtx = clone->get_system_matrix(); ASSERT_EQ(clone->get_size(), gko::dim<2>(3, 3)); - GKO_ASSERT_MTX_NEAR(clone_mtx.get(), csr_mtx.get(), 0); + GKO_ASSERT_MTX_NEAR(clone_mtx.get(), this->csr_mtx.get(), 0); } -TEST_F(LowerTrs, CanBeCleared) +TYPED_TEST(LowerTrs, CanBeCleared) { - solver->clear(); + this->solver->clear(); - auto solver_mtx = solver->get_system_matrix(); + auto 
solver_mtx = this->solver->get_system_matrix(); ASSERT_EQ(solver_mtx, nullptr); - ASSERT_EQ(solver->get_size(), gko::dim<2>(0, 0)); + ASSERT_EQ(this->solver->get_size(), gko::dim<2>(0, 0)); } diff --git a/reference/test/solver/lower_trs_kernels.cpp b/reference/test/solver/lower_trs_kernels.cpp index 22ba58a8912..6379dca192d 100644 --- a/reference/test/solver/lower_trs_kernels.cpp +++ b/reference/test/solver/lower_trs_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -45,20 +45,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include +#include #include #include "core/solver/lower_trs_kernels.hpp" -#include "core/test/utils/assertions.hpp" +#include "core/test/utils.hpp" namespace { +template class LowerTrs : public ::testing::Test { protected: - using Mtx = gko::matrix::Dense<>; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Mtx = gko::matrix::Dense; + using Solver = gko::solver::LowerTrs; LowerTrs() : exec(gko::ReferenceExecutor::create()), ref(gko::ReferenceExecutor::create()), @@ -66,16 +72,15 @@ class LowerTrs : public ::testing::Test { {{1, 0.0, 0.0}, {3.0, 1, 0.0}, {1.0, 2.0, 1}}, exec)), mtx2(gko::initialize( {{2, 0.0, 0.0}, {3.0, 3, 0.0}, {1.0, 2.0, 4}}, exec)), - lower_trs_factory(gko::solver::LowerTrs<>::build().on(exec)), - lower_trs_factory_mrhs( - gko::solver::LowerTrs<>::build().with_num_rhs(2u).on(exec)), + lower_trs_factory(Solver::build().on(exec)), + lower_trs_factory_mrhs(Solver::build().with_num_rhs(2u).on(exec)), mtx_big(gko::initialize({{124.0, 0.0, 0.0, 0.0, 0.0}, {43.0, -789.0, 0.0, 0.0, 0.0}, {134.5, -651.0, 654.0, 0.0, 0.0}, 
{-642.0, 684.0, 68.0, 387.0, 0.0}, {365.0, 97.0, -654.0, 8.0, 91.0}}, exec)), - lower_trs_factory_big(gko::solver::LowerTrs<>::build().on(exec)) + lower_trs_factory_big(Solver::build().on(exec)) {} std::shared_ptr exec; @@ -83,101 +88,149 @@ class LowerTrs : public ::testing::Test { std::shared_ptr mtx; std::shared_ptr mtx2; std::shared_ptr mtx_big; - std::unique_ptr::Factory> lower_trs_factory; - std::unique_ptr::Factory> lower_trs_factory_mrhs; - std::unique_ptr::Factory> lower_trs_factory_big; + std::unique_ptr lower_trs_factory; + std::unique_ptr lower_trs_factory_mrhs; + std::unique_ptr lower_trs_factory_big; }; +TYPED_TEST_CASE(LowerTrs, gko::test::ValueIndexTypes); -TEST_F(LowerTrs, RefLowerTrsFlagCheckIsCorrect) + +TYPED_TEST(LowerTrs, RefLowerTrsFlagCheckIsCorrect) { bool trans_flag = true; bool expected_flag = false; - gko::kernels::reference::lower_trs::should_perform_transpose(ref, + gko::kernels::reference::lower_trs::should_perform_transpose(this->ref, trans_flag); ASSERT_EQ(expected_flag, trans_flag); } -TEST_F(LowerTrs, SolvesTriangularSystem) +TYPED_TEST(LowerTrs, SolvesTriangularSystem) { - std::shared_ptr b = gko::initialize({1.0, 2.0, 1.0}, exec); - auto x = gko::initialize({0.0, 0.0, 0.0}, exec); - auto solver = lower_trs_factory->generate(mtx); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + std::shared_ptr b = gko::initialize({1.0, 2.0, 1.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + auto solver = this->lower_trs_factory->generate(this->mtx); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({1.0, -1.0, 2.0}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({1.0, -1.0, 2.0}), r::value); } -TEST_F(LowerTrs, SolvesMultipleTriangularSystems) +TYPED_TEST(LowerTrs, SolvesMultipleTriangularSystems) { - std::shared_ptr b = - gko::initialize({{3.0, 4.0}, {1.0, 0.0}, {1.0, -1.0}}, exec); - auto x = gko::initialize({{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, exec); - auto solver = 
lower_trs_factory_mrhs->generate(mtx); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using T = value_type; + std::shared_ptr b = gko::initialize( + {I{3.0, 4.0}, I{1.0, 0.0}, I{1.0, -1.0}}, this->exec); + auto x = gko::initialize( + {I{0.0, 0.0}, I{0.0, 0.0}, I{0.0, 0.0}}, this->exec); + auto solver = this->lower_trs_factory_mrhs->generate(this->mtx); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({{3.0, 4.0}, {-8.0, -12.0}, {14.0, 19.0}}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({{3.0, 4.0}, {-8.0, -12.0}, {14.0, 19.0}}), + r::value); } -TEST_F(LowerTrs, SolvesNonUnitTriangularSystem) +TYPED_TEST(LowerTrs, SolvesNonUnitTriangularSystem) { - std::shared_ptr b = gko::initialize({2.0, 12.0, 3.0}, exec); - auto x = gko::initialize({0.0, 0.0, 0.0}, exec); - auto solver = lower_trs_factory->generate(mtx2); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + std::shared_ptr b = gko::initialize({2.0, 12.0, 3.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + auto solver = this->lower_trs_factory->generate(this->mtx2); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, -1.0}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, -1.0}), r::value); } -TEST_F(LowerTrs, SolvesTriangularSystemUsingAdvancedApply) + +TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApply) { - auto alpha = gko::initialize({2.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - std::shared_ptr b = gko::initialize({1.0, 2.0, 1.0}, exec); - auto x = gko::initialize({1.0, -1.0, 1.0}, exec); - auto solver = lower_trs_factory->generate(mtx); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + std::shared_ptr b = gko::initialize({1.0, 2.0, 1.0}, this->exec); + auto x = gko::initialize({1.0, -1.0, 1.0}, 
this->exec); + auto solver = this->lower_trs_factory->generate(this->mtx); solver->apply(alpha.get(), b.get(), beta.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({1.0, -1.0, 3.0}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({1.0, -1.0, 3.0}), r::value); } -TEST_F(LowerTrs, SolvesMultipleTriangularSystemsUsingAdvancedApply) +TYPED_TEST(LowerTrs, SolvesMultipleTriangularSystemsUsingAdvancedApply) { - auto alpha = gko::initialize({-1.0}, exec); - auto beta = gko::initialize({2.0}, exec); - std::shared_ptr b = - gko::initialize({{3.0, 4.0}, {1.0, 0.0}, {1.0, -1.0}}, exec); - auto x = - gko::initialize({{1.0, 2.0}, {-1.0, -1.0}, {0.0, -2.0}}, exec); - auto solver = lower_trs_factory_mrhs->generate(mtx); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + std::shared_ptr b = gko::initialize( + {I{3.0, 4.0}, I{1.0, 0.0}, I{1.0, -1.0}}, this->exec); + auto x = gko::initialize( + {I{1.0, 2.0}, I{-1.0, -1.0}, I{0.0, -2.0}}, this->exec); + auto solver = this->lower_trs_factory_mrhs->generate(this->mtx); solver->apply(alpha.get(), b.get(), beta.get(), x.get()); GKO_ASSERT_MTX_NEAR(x, l({{-1.0, 0.0}, {6.0, 10.0}, {-14.0, -23.0}}), - 1e-14); + r::value); } -TEST_F(LowerTrs, SolvesBigDenseSystem) +TYPED_TEST(LowerTrs, SolvesBigDenseSystem) { - std::shared_ptr b = - gko::initialize({-124.0, -3199.0, 3147.5, 5151.0, -6021.0}, exec); - auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0}, exec); - auto solver = lower_trs_factory_big->generate(mtx_big); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + std::shared_ptr b = gko::initialize( + {-124.0, -3199.0, 3147.5, 5151.0, -6021.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); + auto solver = this->lower_trs_factory_big->generate(this->mtx_big); solver->apply(b.get(), x.get()); - 
GKO_ASSERT_MTX_NEAR(x, l({-1.0, 4.0, 9.0, 3.0, -2.0}), 1e-10); + GKO_ASSERT_MTX_NEAR(x, l({-1.0, 4.0, 9.0, 3.0, -2.0}), + r::value * 1e3); +} + + +TYPED_TEST(LowerTrs, SolvesTransposedTriangularSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + std::shared_ptr b = gko::initialize({1.0, 2.0, 1.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + auto solver = this->lower_trs_factory->generate(this->mtx); + + solver->transpose()->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({0.0, 0.0, 1.0}), r::value); +} + + +TYPED_TEST(LowerTrs, SolvesConjTransposedTriangularSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + std::shared_ptr b = gko::initialize({1.0, 2.0, 1.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + auto solver = this->lower_trs_factory->generate(this->mtx); + + solver->conj_transpose()->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({0.0, 0.0, 1.0}), r::value); } diff --git a/reference/test/solver/upper_trs.cpp b/reference/test/solver/upper_trs.cpp index 6f07b6960a4..178fa5aff3c 100644 --- a/reference/test/solver/upper_trs.cpp +++ b/reference/test/solver/upper_trs.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -44,17 +44,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include "core/test/utils/assertions.hpp" +#include "core/test/utils.hpp" namespace { +template class UpperTrs : public ::testing::Test { protected: - using CsrMtx = gko::matrix::Csr; - using Mtx = gko::matrix::Dense<>; - using Solver = gko::solver::UpperTrs<>; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using CsrMtx = gko::matrix::Csr; + using Mtx = gko::matrix::Dense; + using Solver = gko::solver::UpperTrs; UpperTrs() : exec(gko::ReferenceExecutor::create()), @@ -69,63 +74,69 @@ class UpperTrs : public ::testing::Test { std::shared_ptr exec; std::shared_ptr mtx; std::shared_ptr csr_mtx; - std::unique_ptr upper_trs_factory; + std::unique_ptr upper_trs_factory; std::unique_ptr upper_trs_solver; }; +TYPED_TEST_CASE(UpperTrs, gko::test::ValueIndexTypes); -TEST_F(UpperTrs, UpperTrsFactoryCreatesCorrectSolver) + +TYPED_TEST(UpperTrs, UpperTrsFactoryCreatesCorrectSolver) { - auto sys_mtx = upper_trs_solver->get_system_matrix(); + auto sys_mtx = this->upper_trs_solver->get_system_matrix(); - ASSERT_EQ(upper_trs_solver->get_size(), gko::dim<2>(3, 3)); + ASSERT_EQ(this->upper_trs_solver->get_size(), gko::dim<2>(3, 3)); ASSERT_NE(sys_mtx, nullptr); - GKO_ASSERT_MTX_NEAR(sys_mtx, csr_mtx, 0); + GKO_ASSERT_MTX_NEAR(sys_mtx, this->csr_mtx, 0); } -TEST_F(UpperTrs, CanBeCopied) +TYPED_TEST(UpperTrs, CanBeCopied) { - auto copy = Solver::build().on(exec)->generate(Mtx::create(exec)); + using Mtx = typename TestFixture::Mtx; + using Solver = typename TestFixture::Solver; + auto copy = + Solver::build().on(this->exec)->generate(Mtx::create(this->exec)); - copy->copy_from(gko::lend(upper_trs_solver)); + copy->copy_from(gko::lend(this->upper_trs_solver)); auto copy_mtx = copy->get_system_matrix(); ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3)); - GKO_ASSERT_MTX_NEAR(copy_mtx.get(), csr_mtx.get(), 0); + GKO_ASSERT_MTX_NEAR(copy_mtx.get(), 
this->csr_mtx.get(), 0); } -TEST_F(UpperTrs, CanBeMoved) +TYPED_TEST(UpperTrs, CanBeMoved) { - auto copy = upper_trs_factory->generate(Mtx::create(exec)); + using Mtx = typename TestFixture::Mtx; + auto copy = this->upper_trs_factory->generate(Mtx::create(this->exec)); - copy->copy_from(std::move(upper_trs_solver)); + copy->copy_from(std::move(this->upper_trs_solver)); auto copy_mtx = copy->get_system_matrix(); ASSERT_EQ(copy->get_size(), gko::dim<2>(3, 3)); - GKO_ASSERT_MTX_NEAR(copy_mtx.get(), csr_mtx.get(), 0); + GKO_ASSERT_MTX_NEAR(copy_mtx.get(), this->csr_mtx.get(), 0); } -TEST_F(UpperTrs, CanBeCloned) +TYPED_TEST(UpperTrs, CanBeCloned) { - auto clone = upper_trs_solver->clone(); + auto clone = this->upper_trs_solver->clone(); auto clone_mtx = clone->get_system_matrix(); ASSERT_EQ(clone->get_size(), gko::dim<2>(3, 3)); - GKO_ASSERT_MTX_NEAR(clone_mtx.get(), csr_mtx.get(), 0); + GKO_ASSERT_MTX_NEAR(clone_mtx.get(), this->csr_mtx.get(), 0); } -TEST_F(UpperTrs, CanBeCleared) +TYPED_TEST(UpperTrs, CanBeCleared) { - upper_trs_solver->clear(); + this->upper_trs_solver->clear(); - auto solver_mtx = upper_trs_solver->get_system_matrix(); + auto solver_mtx = this->upper_trs_solver->get_system_matrix(); - ASSERT_EQ(upper_trs_solver->get_size(), gko::dim<2>(0, 0)); + ASSERT_EQ(this->upper_trs_solver->get_size(), gko::dim<2>(0, 0)); ASSERT_EQ(solver_mtx, nullptr); } diff --git a/reference/test/solver/upper_trs_kernels.cpp b/reference/test/solver/upper_trs_kernels.cpp index a9863054b94..81da3158442 100644 --- a/reference/test/solver/upper_trs_kernels.cpp +++ b/reference/test/solver/upper_trs_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -45,20 +45,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include -#include +#include #include #include "core/solver/upper_trs_kernels.hpp" -#include "core/test/utils/assertions.hpp" +#include "core/test/utils.hpp" namespace { +template class UpperTrs : public ::testing::Test { protected: - using Mtx = gko::matrix::Dense<>; + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Mtx = gko::matrix::Dense; + using Solver = gko::solver::UpperTrs; UpperTrs() : exec(gko::ReferenceExecutor::create()), ref(gko::ReferenceExecutor::create()), @@ -66,9 +72,8 @@ class UpperTrs : public ::testing::Test { {{1, 3.0, 1.0}, {0.0, 1, 2.0}, {0.0, 0.0, 1}}, exec)), mtx2(gko::initialize( {{2, 3.0, 1.0}, {0.0, 3, 2.0}, {0.0, 0.0, 4}}, exec)), - upper_trs_factory(gko::solver::UpperTrs<>::build().on(exec)), - upper_trs_factory_mrhs( - gko::solver::UpperTrs<>::build().with_num_rhs(2u).on(exec)), + upper_trs_factory(Solver::build().on(exec)), + upper_trs_factory_mrhs(Solver::build().with_num_rhs(2u).on(exec)), mtx_big(gko::initialize({{365.0, 97.0, -654.0, 8.0, 91.0}, {0.0, -642.0, 684.0, 68.0, 387.0}, {0.0, 0.0, 134, -651.0, 654.0}, @@ -82,100 +87,149 @@ class UpperTrs : public ::testing::Test { std::shared_ptr mtx; std::shared_ptr mtx2; std::shared_ptr mtx_big; - std::unique_ptr::Factory> upper_trs_factory; - std::unique_ptr::Factory> upper_trs_factory_mrhs; + std::unique_ptr upper_trs_factory; + std::unique_ptr upper_trs_factory_mrhs; }; +TYPED_TEST_CASE(UpperTrs, gko::test::ValueIndexTypes); -TEST_F(UpperTrs, RefUpperTrsFlagCheckIsCorrect) + +TYPED_TEST(UpperTrs, RefUpperTrsFlagCheckIsCorrect) { bool trans_flag = true; bool expected_flag = false; - gko::kernels::reference::upper_trs::should_perform_transpose(ref, + gko::kernels::reference::upper_trs::should_perform_transpose(this->ref, trans_flag); ASSERT_EQ(expected_flag, trans_flag); } -TEST_F(UpperTrs, SolvesTriangularSystem) 
+TYPED_TEST(UpperTrs, SolvesTriangularSystem) { - std::shared_ptr b = gko::initialize({4.0, 2.0, 3.0}, exec); - auto x = gko::initialize({0.0, 0.0, 0.0}, exec); - auto solver = upper_trs_factory->generate(mtx); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + std::shared_ptr b = gko::initialize({4.0, 2.0, 3.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + auto solver = this->upper_trs_factory->generate(this->mtx); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({13.0, -4.0, 3.0}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({13.0, -4.0, 3.0}), r::value); } -TEST_F(UpperTrs, SolvesMultipleTriangularSystems) +TYPED_TEST(UpperTrs, SolvesMultipleTriangularSystems) { - std::shared_ptr b = - gko::initialize({{4.0, 2.0}, {2.0, 1.0}, {3.0, -1.0}}, exec); - auto x = gko::initialize({{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, exec); - auto solver = upper_trs_factory_mrhs->generate(mtx); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using T = value_type; + std::shared_ptr b = gko::initialize( + {I{4.0, 2.0}, I{2.0, 1.0}, I{3.0, -1.0}}, this->exec); + auto x = gko::initialize( + {I{0.0, 0.0}, I{0.0, 0.0}, I{0.0, 0.0}}, this->exec); + auto solver = this->upper_trs_factory_mrhs->generate(this->mtx); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({{13.0, -6.0}, {-4.0, 3.0}, {3.0, -1.0}}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({{13.0, -6.0}, {-4.0, 3.0}, {3.0, -1.0}}), + r::value); } -TEST_F(UpperTrs, SolvesNonUnitTriangularSystem) +TYPED_TEST(UpperTrs, SolvesNonUnitTriangularSystem) { - std::shared_ptr b = gko::initialize({10.0, 7.0, -4.0}, exec); - auto x = gko::initialize({0.0, 0.0, 0.0}, exec); - auto solver = upper_trs_factory->generate(mtx2); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + std::shared_ptr b = + gko::initialize({10.0, 7.0, -4.0}, this->exec); + auto x = gko::initialize({0.0, 
0.0, 0.0}, this->exec); + auto solver = this->upper_trs_factory->generate(this->mtx2); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, -1.0}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, -1.0}), r::value); } -TEST_F(UpperTrs, SolvesTriangularSystemUsingAdvancedApply) +TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApply) { - auto alpha = gko::initialize({2.0}, exec); - auto beta = gko::initialize({-1.0}, exec); - std::shared_ptr b = gko::initialize({4.0, 2.0, 3.0}, exec); - auto x = gko::initialize({1.0, -1.0, 1.0}, exec); - auto solver = upper_trs_factory->generate(mtx); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + std::shared_ptr b = gko::initialize({4.0, 2.0, 3.0}, this->exec); + auto x = gko::initialize({1.0, -1.0, 1.0}, this->exec); + auto solver = this->upper_trs_factory->generate(this->mtx); solver->apply(alpha.get(), b.get(), beta.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({25.0, -7.0, 5.0}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({25.0, -7.0, 5.0}), r::value); } -TEST_F(UpperTrs, SolvesMultipleTriangularSystemsUsingAdvancedApply) +TYPED_TEST(UpperTrs, SolvesMultipleTriangularSystemsUsingAdvancedApply) { - auto alpha = gko::initialize({-1.0}, exec); - auto beta = gko::initialize({2.0}, exec); - std::shared_ptr b = - gko::initialize({{4.0, 1.0}, {1.0, 2.0}, {2.0, 3.0}}, exec); - auto x = - gko::initialize({{1.0, 2.0}, {-1.0, -1.0}, {1.0, -2.0}}, exec); - auto solver = upper_trs_factory_mrhs->generate(mtx); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + std::shared_ptr b = gko::initialize( + {I{4.0, 1.0}, I{1.0, 2.0}, I{2.0, 3.0}}, this->exec); + auto x = gko::initialize( + {I{1.0, 2.0}, I{-1.0, -1.0}, 
I{1.0, -2.0}}, this->exec); + auto solver = this->upper_trs_factory_mrhs->generate(this->mtx); solver->apply(alpha.get(), b.get(), beta.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({{-9.0, -6.0}, {1.0, 2.0}, {0.0, -7.0}}), 1e-14); + GKO_ASSERT_MTX_NEAR(x, l({{-9.0, -6.0}, {1.0, 2.0}, {0.0, -7.0}}), + r::value); } -TEST_F(UpperTrs, SolvesBigDenseSystem) +TYPED_TEST(UpperTrs, SolvesBigDenseSystem) { - std::shared_ptr b = - gko::initialize({-6021.0, 3018.0, -2055.0, 1707.0, -248.0}, exec); - auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0}, exec); - auto solver = upper_trs_factory->generate(mtx_big); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + std::shared_ptr b = gko::initialize( + {-6021.0, 3018.0, -2055.0, 1707.0, -248.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); + auto solver = this->upper_trs_factory->generate(this->mtx_big); solver->apply(b.get(), x.get()); - GKO_ASSERT_MTX_NEAR(x, l({-1.0, 4.0, 9.0, 3.0, -2.0}), 1e-10); + GKO_ASSERT_MTX_NEAR(x, l({-1.0, 4.0, 9.0, 3.0, -2.0}), + r::value * 1e3); +} + + +TYPED_TEST(UpperTrs, SolvesTransposedTriangularSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + std::shared_ptr b = gko::initialize({4.0, 2.0, 3.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + auto solver = this->upper_trs_factory->generate(this->mtx); + + solver->transpose()->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({4.0, -10.0, 19.0}), r::value); +} + + +TYPED_TEST(UpperTrs, SolvesConjTransposedTriangularSystem) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + std::shared_ptr b = gko::initialize({4.0, 2.0, 3.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + auto solver = this->upper_trs_factory->generate(this->mtx); + + solver->conj_transpose()->apply(b.get(), x.get()); + + 
GKO_ASSERT_MTX_NEAR(x, l({4.0, -10.0, 19.0}), r::value); } diff --git a/reference/test/stop/CMakeLists.txt b/reference/test/stop/CMakeLists.txt index 771a14696dd..fb27a843369 100644 --- a/reference/test/stop/CMakeLists.txt +++ b/reference/test/stop/CMakeLists.txt @@ -1,5 +1,5 @@ ginkgo_create_test(combined) ginkgo_create_test(criterion_kernels) ginkgo_create_test(iteration) -ginkgo_create_test(residual_norm_reduction_kernels) +ginkgo_create_test(residual_norm_kernels) ginkgo_create_test(time) diff --git a/reference/test/stop/combined.cpp b/reference/test/stop/combined.cpp index 9f8629b0068..08939c64392 100644 --- a/reference/test/stop/combined.cpp +++ b/reference/test/stop/combined.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,11 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include -#include - - -#include #include #include #if defined(_WIN32) || defined(__CYGWIN__) @@ -45,6 +40,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#include + + +#include +#include + + namespace { diff --git a/reference/test/stop/criterion_kernels.cpp b/reference/test/stop/criterion_kernels.cpp index 328dc79f4d1..b2fa160f8e3 100644 --- a/reference/test/stop/criterion_kernels.cpp +++ b/reference/test/stop/criterion_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -31,12 +31,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ #include -#include #include +#include + + namespace { diff --git a/reference/test/stop/iteration.cpp b/reference/test/stop/iteration.cpp index f2c8637c863..fd9d34114b5 100644 --- a/reference/test/stop/iteration.cpp +++ b/reference/test/stop/iteration.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/reference/test/stop/residual_norm_kernels.cpp b/reference/test/stop/residual_norm_kernels.cpp new file mode 100644 index 00000000000..bf2de2bc20c --- /dev/null +++ b/reference/test/stop/residual_norm_kernels.cpp @@ -0,0 +1,431 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include +#include + + +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +template +class ResidualNormReduction : public ::testing::Test { +protected: + using Mtx = gko::matrix::Dense; + using NormVector = gko::matrix::Dense>; + + ResidualNormReduction() + { + exec_ = gko::ReferenceExecutor::create(); + factory_ = gko::stop::ResidualNormReduction::build() + .with_reduction_factor(r::value) + .on(exec_); + } + + std::unique_ptr::Factory> + factory_; + std::shared_ptr exec_; +}; + +TYPED_TEST_CASE(ResidualNormReduction, gko::test::ValueTypes); + + +TYPED_TEST(ResidualNormReduction, CanCreateFactory) +{ + ASSERT_NE(this->factory_, nullptr); + ASSERT_EQ(this->factory_->get_parameters().reduction_factor, + r::value); + ASSERT_EQ(this->factory_->get_executor(), this->exec_); +} + + +TYPED_TEST(ResidualNormReduction, CannotCreateCriterionWithoutB) +{ + ASSERT_THROW(this->factory_->generate(nullptr, nullptr, nullptr, nullptr), + gko::NotSupported); +} + + +TYPED_TEST(ResidualNormReduction, CanCreateCriterionWithB) +{ + using Mtx = typename TestFixture::Mtx; + std::shared_ptr scalar = + gko::initialize({1.0}, this->exec_); + auto criterion = + this->factory_->generate(nullptr, nullptr, nullptr, scalar.get()); + ASSERT_NE(criterion, nullptr); +} + + +TYPED_TEST(ResidualNormReduction, WaitsTillResidualGoal) +{ + using Mtx = typename TestFixture::Mtx; + 
using NormVector = typename TestFixture::NormVector; + auto initial_res = gko::initialize({100.0}, this->exec_); + std::shared_ptr rhs = gko::initialize({10.0}, this->exec_); + auto res_norm = gko::initialize({100.0}, this->exec_); + auto criterion = + this->factory_->generate(nullptr, rhs, nullptr, initial_res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(this->exec_, 1); + stop_status.get_data()[0].reset(); + + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res_norm->at(0) = r::value * 1.1e+2; + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[0].has_converged(), false); + ASSERT_EQ(one_changed, false); + + res_norm->at(0) = r::value * 0.9e+2; + ASSERT_TRUE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[0].has_converged(), true); + ASSERT_EQ(one_changed, true); +} + + +TYPED_TEST(ResidualNormReduction, WaitsTillResidualGoalMultipleRHS) +{ + using Mtx = typename TestFixture::Mtx; + using NormVector = typename TestFixture::NormVector; + using T = TypeParam; + using T_nc = gko::remove_complex; + auto res = gko::initialize({I{100.0, 100.0}}, this->exec_); + auto res_norm = + gko::initialize({I{100.0, 100.0}}, this->exec_); + std::shared_ptr rhs = + gko::initialize({I{10.0, 10.0}}, this->exec_); + auto criterion = this->factory_->generate(nullptr, rhs, nullptr, res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(this->exec_, 2); + stop_status.get_data()[0].reset(); + stop_status.get_data()[1].reset(); + + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + 
res_norm->at(0, 0) = r::value * 0.9e+2; + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[0].has_converged(), true); + ASSERT_EQ(one_changed, true); + + res_norm->at(0, 1) = r::value * 0.9e+2; + ASSERT_TRUE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[1].has_converged(), true); + ASSERT_EQ(one_changed, true); +} + + +template +class RelativeResidualNorm : public ::testing::Test { +protected: + using Mtx = gko::matrix::Dense; + using NormVector = gko::matrix::Dense>; + + RelativeResidualNorm() + { + exec_ = gko::ReferenceExecutor::create(); + factory_ = gko::stop::RelativeResidualNorm::build() + .with_tolerance(r::value) + .on(exec_); + } + + std::unique_ptr::Factory> + factory_; + std::shared_ptr exec_; +}; + +TYPED_TEST_CASE(RelativeResidualNorm, gko::test::ValueTypes); + + +TYPED_TEST(RelativeResidualNorm, CanCreateFactory) +{ + ASSERT_NE(this->factory_, nullptr); + ASSERT_EQ(this->factory_->get_parameters().tolerance, r::value); + ASSERT_EQ(this->factory_->get_executor(), this->exec_); +} + + +TYPED_TEST(RelativeResidualNorm, CannotCreateCriterionWithoutB) +{ + ASSERT_THROW(this->factory_->generate(nullptr, nullptr, nullptr, nullptr), + gko::NotSupported); +} + + +TYPED_TEST(RelativeResidualNorm, CanCreateCriterionWithB) +{ + using Mtx = typename TestFixture::Mtx; + std::shared_ptr scalar = + gko::initialize({1.0}, this->exec_); + auto criterion = + this->factory_->generate(nullptr, scalar, nullptr, nullptr); + ASSERT_NE(criterion, nullptr); +} + + +TYPED_TEST(RelativeResidualNorm, WaitsTillResidualGoal) +{ + using Mtx = typename TestFixture::Mtx; + using NormVector = typename TestFixture::NormVector; + auto initial_res = gko::initialize({100.0}, this->exec_); + std::shared_ptr rhs = gko::initialize({10.0}, this->exec_); + auto 
res_norm = gko::initialize({100.0}, this->exec_); + auto criterion = + this->factory_->generate(nullptr, rhs, nullptr, initial_res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(this->exec_, 1); + stop_status.get_data()[0].reset(); + + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res_norm->at(0) = r::value * 1.1e+1; + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[0].has_converged(), false); + ASSERT_EQ(one_changed, false); + + res_norm->at(0) = r::value * 0.9e+1; + ASSERT_TRUE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[0].has_converged(), true); + ASSERT_EQ(one_changed, true); +} + + +TYPED_TEST(RelativeResidualNorm, WaitsTillResidualGoalMultipleRHS) +{ + using Mtx = typename TestFixture::Mtx; + using NormVector = typename TestFixture::NormVector; + using T = TypeParam; + using T_nc = gko::remove_complex; + auto res = gko::initialize({I{100.0, 100.0}}, this->exec_); + auto res_norm = + gko::initialize({I{100.0, 100.0}}, this->exec_); + std::shared_ptr rhs = + gko::initialize({I{10.0, 10.0}}, this->exec_); + auto criterion = this->factory_->generate(nullptr, rhs, nullptr, res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(this->exec_, 2); + stop_status.get_data()[0].reset(); + stop_status.get_data()[1].reset(); + + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res_norm->at(0, 0) = r::value * 0.9e+1; + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + 
ASSERT_EQ(stop_status.get_data()[0].has_converged(), true); + ASSERT_EQ(one_changed, true); + + res_norm->at(0, 1) = r::value * 0.9e+1; + ASSERT_TRUE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[1].has_converged(), true); + ASSERT_EQ(one_changed, true); +} + + +template +class AbsoluteResidualNorm : public ::testing::Test { +protected: + using Mtx = gko::matrix::Dense; + using NormVector = gko::matrix::Dense>; + + AbsoluteResidualNorm() + { + exec_ = gko::ReferenceExecutor::create(); + factory_ = gko::stop::AbsoluteResidualNorm::build() + .with_tolerance(r::value) + .on(exec_); + } + + std::unique_ptr::Factory> + factory_; + std::shared_ptr exec_; +}; + +TYPED_TEST_CASE(AbsoluteResidualNorm, gko::test::ValueTypes); + + +TYPED_TEST(AbsoluteResidualNorm, CanCreateFactory) +{ + ASSERT_NE(this->factory_, nullptr); + ASSERT_EQ(this->factory_->get_parameters().tolerance, r::value); + ASSERT_EQ(this->factory_->get_executor(), this->exec_); +} + + +TYPED_TEST(AbsoluteResidualNorm, CannotCreateCriterionWithoutB) +{ + ASSERT_THROW(this->factory_->generate(nullptr, nullptr, nullptr, nullptr), + gko::NotSupported); +} + + +TYPED_TEST(AbsoluteResidualNorm, CanCreateCriterionWithB) +{ + using Mtx = typename TestFixture::Mtx; + std::shared_ptr scalar = + gko::initialize({1.0}, this->exec_); + auto criterion = + this->factory_->generate(nullptr, scalar, nullptr, nullptr); + ASSERT_NE(criterion, nullptr); +} + + +TYPED_TEST(AbsoluteResidualNorm, WaitsTillResidualGoal) +{ + using Mtx = typename TestFixture::Mtx; + using NormVector = typename TestFixture::NormVector; + auto initial_res = gko::initialize({100.0}, this->exec_); + std::shared_ptr rhs = gko::initialize({10.0}, this->exec_); + auto res_norm = gko::initialize({100.0}, this->exec_); + auto criterion = + this->factory_->generate(nullptr, rhs, nullptr, initial_res.get()); + bool one_changed{}; + constexpr gko::uint8 
RelativeStoppingId{1}; + gko::Array stop_status(this->exec_, 1); + stop_status.get_data()[0].reset(); + + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res_norm->at(0) = r::value * 1.1; + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[0].has_converged(), false); + ASSERT_EQ(one_changed, false); + + res_norm->at(0) = r::value * 0.9; + ASSERT_TRUE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[0].has_converged(), true); + ASSERT_EQ(one_changed, true); +} + + +TYPED_TEST(AbsoluteResidualNorm, WaitsTillResidualGoalMultipleRHS) +{ + using Mtx = typename TestFixture::Mtx; + using NormVector = typename TestFixture::NormVector; + using T = TypeParam; + using T_nc = gko::remove_complex; + auto res = gko::initialize({I{100.0, 100.0}}, this->exec_); + auto res_norm = + gko::initialize({I{100.0, 100.0}}, this->exec_); + std::shared_ptr rhs = + gko::initialize({I{10.0, 10.0}}, this->exec_); + auto criterion = this->factory_->generate(nullptr, rhs, nullptr, res.get()); + bool one_changed{}; + constexpr gko::uint8 RelativeStoppingId{1}; + gko::Array stop_status(this->exec_, 2); + stop_status.get_data()[0].reset(); + stop_status.get_data()[1].reset(); + + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + + res_norm->at(0, 0) = r::value * 0.9; + ASSERT_FALSE( + criterion->update() + .residual_norm(res_norm.get()) + .check(RelativeStoppingId, true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[0].has_converged(), true); + ASSERT_EQ(one_changed, true); + + res_norm->at(0, 1) = r::value * 0.9; + ASSERT_TRUE( + criterion->update() + .residual_norm(res_norm.get()) + 
.check(RelativeStoppingId, true, &stop_status, &one_changed)); + ASSERT_EQ(stop_status.get_data()[1].has_converged(), true); + ASSERT_EQ(one_changed, true); +} + + +} // namespace diff --git a/reference/test/stop/residual_norm_reduction_kernels.cpp b/reference/test/stop/residual_norm_reduction_kernels.cpp deleted file mode 100644 index c326280ae5e..00000000000 --- a/reference/test/stop/residual_norm_reduction_kernels.cpp +++ /dev/null @@ -1,152 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#include - - -#include - - -namespace { - - -constexpr double reduction_factor = 1.0e-14; - - -class ResidualNormReduction : public ::testing::Test { -protected: - using Mtx = gko::matrix::Dense<>; - - ResidualNormReduction() - { - exec_ = gko::ReferenceExecutor::create(); - factory_ = gko::stop::ResidualNormReduction<>::build() - .with_reduction_factor(reduction_factor) - .on(exec_); - } - - std::unique_ptr::Factory> factory_; - std::shared_ptr exec_; -}; - - -TEST_F(ResidualNormReduction, CanCreateFactory) -{ - ASSERT_NE(factory_, nullptr); - ASSERT_EQ(factory_->get_parameters().reduction_factor, reduction_factor); - ASSERT_EQ(factory_->get_executor(), exec_); -} - - -TEST_F(ResidualNormReduction, CannotCreateCriterionWithoutB) -{ - ASSERT_THROW(factory_->generate(nullptr, nullptr, nullptr, nullptr), - gko::NotSupported); -} - - -TEST_F(ResidualNormReduction, CanCreateCriterionWithB) -{ - std::shared_ptr scalar = - gko::initialize>({1.0}, exec_); - auto criterion = - factory_->generate(nullptr, nullptr, nullptr, scalar.get()); - ASSERT_NE(criterion, nullptr); -} - - -TEST_F(ResidualNormReduction, WaitsTillResidualGoal) -{ - auto scalar = gko::initialize({1.0}, exec_); - auto criterion = - factory_->generate(nullptr, nullptr, nullptr, scalar.get()); - bool one_changed{}; - constexpr gko::uint8 RelativeStoppingId{1}; - gko::Array stop_status(exec_, 1); - 
stop_status.get_data()[0].reset(); - - ASSERT_FALSE( - criterion->update() - .residual_norm(scalar.get()) - .check(RelativeStoppingId, true, &stop_status, &one_changed)); - - scalar->at(0) = reduction_factor * 1.0e+2; - ASSERT_FALSE( - criterion->update() - .residual_norm(scalar.get()) - .check(RelativeStoppingId, true, &stop_status, &one_changed)); - ASSERT_EQ(stop_status.get_data()[0].has_converged(), false); - ASSERT_EQ(one_changed, false); - - scalar->at(0) = reduction_factor * 1.0e-2; - ASSERT_TRUE( - criterion->update() - .residual_norm(scalar.get()) - .check(RelativeStoppingId, true, &stop_status, &one_changed)); - ASSERT_EQ(stop_status.get_data()[0].has_converged(), true); - ASSERT_EQ(one_changed, true); -} - - -TEST_F(ResidualNormReduction, WaitsTillResidualGoalMultipleRHS) -{ - auto mtx = gko::initialize({{1.0, 1.0}}, exec_); - auto criterion = factory_->generate(nullptr, nullptr, nullptr, mtx.get()); - bool one_changed{}; - constexpr gko::uint8 RelativeStoppingId{1}; - gko::Array stop_status(exec_, 2); - // Array only does malloc, it *does not* construct the object - // therefore you get undefined values in your objects whatever you do. - // Proper fix is not easy, we can't just call memset. 
We can probably not - // call the placement constructor either - stop_status.get_data()[0].reset(); - stop_status.get_data()[1].reset(); - - ASSERT_FALSE(criterion->update().residual_norm(mtx.get()).check( - RelativeStoppingId, true, &stop_status, &one_changed)); - - mtx->at(0, 0) = reduction_factor * 1.0e-2; - ASSERT_FALSE(criterion->update().residual_norm(mtx.get()).check( - RelativeStoppingId, true, &stop_status, &one_changed)); - ASSERT_EQ(stop_status.get_data()[0].has_converged(), true); - ASSERT_EQ(one_changed, true); - one_changed = false; - - mtx->at(0, 1) = reduction_factor * 1.0e-2; - ASSERT_TRUE(criterion->update().residual_norm(mtx.get()).check( - RelativeStoppingId, true, &stop_status, &one_changed)); - ASSERT_EQ(stop_status.get_data()[1].has_converged(), true); - ASSERT_EQ(one_changed, true); -} - - -} // namespace diff --git a/reference/test/stop/time.cpp b/reference/test/stop/time.cpp index 8d47b90ff2a..258db5d2854 100644 --- a/reference/test/stop/time.cpp +++ b/reference/test/stop/time.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,7 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include #include #if defined(_WIN32) || defined(__CYGWIN__) @@ -41,6 +40,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#include + + namespace { diff --git a/reference/test/utils/assertions_test.cpp b/reference/test/utils/assertions_test.cpp index 9423ce12e3d..ffa09c8d431 100644 --- a/reference/test/utils/assertions_test.cpp +++ b/reference/test/utils/assertions_test.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include +#include "core/test/utils/assertions.hpp" #include @@ -40,16 +40,24 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/test/utils.hpp" + + namespace { +template +class MatricesNear : public ::testing::Test {}; + +TYPED_TEST_CASE(MatricesNear, gko::test::ValueTypes); + -TEST(MatricesNear, CanPassAnyMatrixType) +TYPED_TEST(MatricesNear, CanPassAnyMatrixType) { auto exec = gko::ReferenceExecutor::create(); - auto mtx = gko::initialize>( + auto mtx = gko::initialize>( {{1.0, 2.0, 3.0}, {0.0, 4.0, 0.0}}, exec); - auto csr_mtx = gko::matrix::Csr<>::create(exec); + auto csr_mtx = gko::matrix::Csr::create(exec); csr_mtx->copy_from(mtx.get()); GKO_EXPECT_MTX_NEAR(csr_mtx, mtx, 0.0); diff --git a/sonar-project.properties b/sonar-project.properties index ae7b444bc32..154dd932951 100644 --- a/sonar-project.properties +++ b/sonar-project.properties @@ -8,4 +8,5 @@ sonar.sources=. sonar.tests=. 
sonar.exclusions="third_party/**, build/**" sonar.test.exclusions="benchmark/**, doc/**, examples/**" -sonar.test.inclusions="*/test/**" \ No newline at end of file +sonar.test.inclusions="*/test/**" +sonar.coverage.exclusions="third_party/**, build/**, benchmark/**, doc/**, examples/**" diff --git a/test_install/CMakeLists.txt b/test_install/CMakeLists.txt index 7eef8b8b7dd..bc7cf6b63a1 100644 --- a/test_install/CMakeLists.txt +++ b/test_install/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.8) +cmake_minimum_required(VERSION 3.9) project(TestInstall LANGUAGES CXX) @@ -7,17 +7,48 @@ find_package(Ginkgo REQUIRED # Alternatively, use `cmake -DCMAKE_PREFIX_PATH=` to specify the install directory ) -if(GINKGO_HAVE_PAPI_SDE) - find_package(PAPI REQUIRED OPTIONAL_COMPONENTS sde) -endif() - -# Needed because of a known issue with CUDA while linking statically. -# For details, see https://gitlab.kitware.com/cmake/cmake/issues/18614 -if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_CUDA) - enable_language(CUDA) +if(MSVC) + if(GINKGO_BUILD_SHARED_LIBS) + ginkgo_switch_to_windows_dynamic("CXX") + ginkgo_switch_to_windows_dynamic("C") + else() + ginkgo_switch_to_windows_static("CXX") + ginkgo_switch_to_windows_static("C") + endif() endif() +include(CheckLanguage) +check_language(CUDA) add_executable(test_install test_install.cpp) target_compile_features(test_install PUBLIC cxx_std_11) target_link_libraries(test_install PRIVATE Ginkgo::ginkgo) + +if(GINKGO_BUILD_CUDA) + enable_language(CUDA) + if(MSVC) + if(GINKGO_BUILD_SHARED_LIBS) + ginkgo_switch_to_windows_dynamic("CUDA") + else() + ginkgo_switch_to_windows_static("CUDA") + endif() + endif() + add_executable(test_install_cuda test_install_cuda.cu) + target_link_libraries(test_install_cuda PRIVATE Ginkgo::ginkgo) +endif() + +if(GINKGO_BUILD_HIP + AND GINKGO_HIP_PLATFORM MATCHES "hcc" + AND GINKGO_HIP_VERSION VERSION_GREATER_EQUAL "3.5" + AND NOT GINKGO_BUILD_SHARED_LIBS) + # Compile options somehow add 
hip-clang specific flags. Wipe them. + # Currently, the flags wiped out should be: + # -x;hip;--hip-device-lib-path=/opt/rocm/lib;--cuda-gpu-arch=gfx900; + # --cuda-gpu-arch=gfx906 + set_target_properties(hip::device PROPERTIES INTERFACE_COMPILE_OPTIONS "") + # In addition, link libraries have a similar problem. We only keep + # `hip::host`. Currently, the flags should be: + # hip::host;--hip-device-lib-path=/opt/rocm/lib;--hip-link; + # --cuda-gpu-arch=gfx900;--cuda-gpu-arch=gfx906 + set_target_properties(hip::device PROPERTIES INTERFACE_LINK_LIBRARIES "hip::host") +endif() diff --git a/test_install/test_install.cpp b/test_install/test_install.cpp index 1479f74fcb8..5ea59794440 100644 --- a/test_install/test_install.cpp +++ b/test_install/test_install.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2019, the Ginkgo authors +Copyright (c) 2017-2020, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -196,10 +196,12 @@ int main(int, char **) auto test = gko::log::Stream<>::create(refExec); } - // core/log/convergence.hpp +#if GKO_HAVE_PAPI_SDE + // core/log/papi.hpp { - auto test = gko::log::Convergence<>::create(refExec); + auto test = gko::log::Papi<>::create(refExec); } +#endif // GKO_HAVE_PAPI_SDE // core/matrix/coo.hpp { @@ -238,6 +240,12 @@ int main(int, char **) auto test = Mtx::create(refExec); } + // core/matrix/permutation.hpp + { + using Mtx = gko::matrix::Permutation<>; + auto test = Mtx::create(refExec, gko::dim<2>{2, 2}); + } + // core/matrix/sellp.hpp { using Mtx = gko::matrix::Sellp<>; @@ -255,6 +263,12 @@ int main(int, char **) auto test = gko::preconditioner::Ilu<>::build().on(refExec); } + // core/preconditioner/isai.hpp + { + auto test_l = gko::preconditioner::LowerIsai<>::build().on(refExec); + auto test_u = gko::preconditioner::UpperIsai<>::build().on(refExec); + } + // core/preconditioner/jacobi.hpp { using Bj = 
gko::preconditioner::Jacobi<>; @@ -337,11 +351,20 @@ int main(int, char **) auto time = gko::stop::Time::build() .with_time_limit(std::chrono::milliseconds(10)) .on(refExec); - // residual_norm_reduction.hpp + + // residual_norm.hpp auto res_red = gko::stop::ResidualNormReduction<>::build() .with_reduction_factor(1e-10) .on(refExec); + auto rel_res = gko::stop::RelativeResidualNorm<>::build() + .with_tolerance(1e-10) + .on(refExec); + + auto abs_res = gko::stop::AbsoluteResidualNorm<>::build() + .with_tolerance(1e-10) + .on(refExec); + // stopping_status.hpp auto stop_status = gko::stopping_status{}; diff --git a/test_install/test_install_cuda.cu b/test_install/test_install_cuda.cu new file mode 100644 index 00000000000..ed2e18c307d --- /dev/null +++ b/test_install/test_install_cuda.cu @@ -0,0 +1,376 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include +#include +#include +#include +#include +#include +#include + + +// core/base/polymorphic_object.hpp +class PolymorphicObjectTest : public gko::PolymorphicObject {}; + + +int main(int, char **) +{ + auto refExec = gko::ReferenceExecutor::create(); + auto cudaExec = gko::CudaExecutor::create(0, refExec); + // core/base/abstract_factory.hpp + { + using type1 = int; + using type2 = double; + static_assert( + std::is_same< + gko::AbstractFactory::abstract_product_type, + type1>::value, + "abstract_factory.hpp not included properly!"); + } + + // core/base/array.hpp + { + using type1 = int; + using ArrayType = gko::Array; + ArrayType{}; + } + + // core/base/combination.hpp + { + using type1 = int; + static_assert( + std::is_same::value_type, type1>::value, + "combination.hpp not included properly!"); + } + + // core/base/composition.hpp + { + using type1 = int; + static_assert( + std::is_same::value_type, type1>::value, + "composition.hpp not included properly"); + } + + // core/base/dim.hpp + { + using type1 = int; + gko::dim<3, type1>{4, 4, 4}; + } + + // core/base/exception.hpp + { + gko::Error(std::string("file"), 12, + std::string("Test for an error class.")); + } + + // core/base/exception_helpers.hpp + { + auto test = gko::dim<2>{3}; + GKO_ASSERT_IS_SQUARE_MATRIX(test); + } + + // core/base/executor.hpp + { + gko::ReferenceExecutor::create(); + } + + // 
core/base/math.hpp + { + using testType = double; + static_assert(gko::is_complex() == false, + "math.hpp not included properly!"); + } + + // core/base/matrix_data.hpp + { + gko::matrix_data<>{}; + } + + // core/base/mtx_io.hpp + { + static_assert(gko::layout_type::array != gko::layout_type::coordinate, + "mtx_io.hpp not included properly!"); + } + + // core/base/name_demangling.hpp + { + auto testVar = 3.0; + gko::name_demangling::get_static_type(testVar); + } + + + // core/base/polymorphic_object.hpp + { + gko::PolymorphicObject *test; + (void)test; // silence unused variable warning + } + + // core/base/range.hpp + { + gko::span{12}; + } + + // core/base/range_accessors.hpp + { + auto testVar = 12; + gko::range>(&testVar, 1u, + 1u, 1u); + } + + // core/base/perturbation.hpp + { + using type1 = int; + static_assert( + std::is_same::value_type, type1>::value, + "perturbation.hpp not included properly"); + } + + // core/base/std_extensions.hpp + { + static_assert(std::is_same, void>::value, + "std_extensions.hpp not included properly!"); + } + + // core/base/types.hpp + { + static_assert(gko::size_type{12} == 12, + "types.hpp not included properly"); + } + + // core/base/utils.hpp + { + gko::null_deleter{}; + } + + // core/base/version.hpp + { + gko::version_info::get().header_version; + } + + // core/factorization/par_ilu.hpp + { + gko::factorization::ParIlu<>::build().on(cudaExec); + } + + // core/log/convergence.hpp + { + gko::log::Convergence<>::create(cudaExec); + } + + // core/log/record.hpp + { + gko::log::executor_data{}; + } + + // core/log/stream.hpp + { + gko::log::Stream<>::create(cudaExec); + } + +#if GKO_HAVE_PAPI_SDE + // core/log/papi.hpp + { + gko::log::Papi<>::create(cudaExec); + } +#endif // GKO_HAVE_PAPI_SDE + + // core/matrix/coo.hpp + { + using Mtx = gko::matrix::Coo<>; + Mtx::create(cudaExec, gko::dim<2>{2, 2}, 2); + } + + // core/matrix/csr.hpp + { + using Mtx = gko::matrix::Csr<>; + Mtx::create(cudaExec, gko::dim<2>{2, 2}, 2, + 
std::make_shared(2)); + } + + // core/matrix/dense.hpp + { + using Mtx = gko::matrix::Dense<>; + Mtx::create(cudaExec, gko::dim<2>{2, 2}); + } + + // core/matrix/ell.hpp + { + using Mtx = gko::matrix::Ell<>; + Mtx::create(cudaExec, gko::dim<2>{2, 2}, 2); + } + + // core/matrix/hybrid.hpp + { + using Mtx = gko::matrix::Hybrid<>; + Mtx::create(cudaExec, gko::dim<2>{2, 2}, 2, 2, 1); + } + + // core/matrix/identity.hpp + { + using Mtx = gko::matrix::Identity<>; + Mtx::create(cudaExec); + } + + // core/matrix/permutation.hpp + { + using Mtx = gko::matrix::Permutation<>; + Mtx::create(cudaExec, gko::dim<2>{2, 2}); + } + + // core/matrix/sellp.hpp + { + using Mtx = gko::matrix::Sellp<>; + Mtx::create(cudaExec, gko::dim<2>{2, 2}, 2); + } + + // core/matrix/sparsity_csr.hpp + { + using Mtx = gko::matrix::SparsityCsr<>; + Mtx::create(cudaExec, gko::dim<2>{2, 2}); + } + + // core/preconditioner/ilu.hpp + { + gko::preconditioner::Ilu<>::build().on(cudaExec); + } + + // core/preconditioner/jacobi.hpp + { + using Bj = gko::preconditioner::Jacobi<>; + Bj::build().with_max_block_size(1u).on(cudaExec); + } + + // core/solver/bicgstab.hpp + { + using Solver = gko::solver::Bicgstab<>; + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on(cudaExec)) + .on(cudaExec); + } + + // core/solver/cg.hpp + { + using Solver = gko::solver::Cg<>; + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on(cudaExec)) + .on(cudaExec); + } + + // core/solver/cgs.hpp + { + using Solver = gko::solver::Cgs<>; + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on(cudaExec)) + .on(cudaExec); + } + + // core/solver/fcg.hpp + { + using Solver = gko::solver::Fcg<>; + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on(cudaExec)) + .on(cudaExec); + } + + // core/solver/gmres.hpp + { + using Solver = gko::solver::Gmres<>; + Solver::build() + .with_criteria( + 
gko::stop::Iteration::build().with_max_iters(1u).on(cudaExec)) + .on(cudaExec); + } + + // core/solver/ir.hpp + { + using Solver = gko::solver::Ir<>; + Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on(cudaExec)) + .on(cudaExec); + } + + // core/solver/lower_trs.hpp + { + using Solver = gko::solver::LowerTrs<>; + Solver::build().on(cudaExec); + } + + // core/stop/ + { + // iteration.hpp + auto iteration = + gko::stop::Iteration::build().with_max_iters(1u).on(cudaExec); + + // time.hpp + auto time = gko::stop::Time::build() + .with_time_limit(std::chrono::milliseconds(10)) + .on(cudaExec); + + // residual_norm.hpp + gko::stop::ResidualNormReduction<>::build() + .with_reduction_factor(1e-10) + .on(cudaExec); + + gko::stop::RelativeResidualNorm<>::build() + .with_tolerance(1e-10) + .on(cudaExec); + + gko::stop::AbsoluteResidualNorm<>::build() + .with_tolerance(1e-10) + .on(cudaExec); + + // stopping_status.hpp + gko::stopping_status{}; + + // combined.hpp + auto combined = + gko::stop::Combined::build() + .with_criteria(std::move(time), std::move(iteration)) + .on(cudaExec); + } + + std::cout + << "test_install_cuda: the Ginkgo installation was correctly detected " + "and is complete." 
+ << std::endl; + + return 0; +} diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index 26eb4d1377a..884e50bf699 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -16,6 +16,8 @@ endif() if(GINKGO_DEVEL_TOOLS) set(GCF_IGNORE_LIST "third_party" CACHE STRING "Ignore directories for GCF") add_subdirectory(git-cmake-format) +else() + add_subdirectory(dummy-hook) endif() if(GINKGO_BUILD_BENCHMARKS) diff --git a/third_party/CudaArchitectureSelector/CMakeLists.txt b/third_party/CudaArchitectureSelector/CMakeLists.txt index e4ed043c539..feccda26a92 100644 --- a/third_party/CudaArchitectureSelector/CMakeLists.txt +++ b/third_party/CudaArchitectureSelector/CMakeLists.txt @@ -1,6 +1,6 @@ ginkgo_load_git_package(CudaArchitectureSelector "https://github.com/ginkgo-project/CudaArchitectureSelector.git" - "0b46fb7d653404db312cbc1fc702cb528fd1c1b0") + "f6e024cc2000eb870dc52166d4cdce9fe7f9a7a4") add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/build EXCLUDE_FROM_ALL) set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" PARENT_SCOPE) diff --git a/third_party/dummy-hook/CMakeLists.txt b/third_party/dummy-hook/CMakeLists.txt new file mode 100644 index 00000000000..043ae8da917 --- /dev/null +++ b/third_party/dummy-hook/CMakeLists.txt @@ -0,0 +1,18 @@ +if(EXISTS "${Ginkgo_SOURCE_DIR}/.git") + set(ADD_HOOK FALSE) + set(HOOK_LOCATION "${Ginkgo_SOURCE_DIR}/.git/hooks/pre-commit") + if(NOT EXISTS "${HOOK_LOCATION}") + set(ADD_HOOK TRUE) + else() + # check if the correct hook is installed + execute_process(COMMAND grep git-cmake-format.py "${HOOK_LOCATION}" + RESULT_VARIABLE res OUTPUT_QUIET) + # return value =/= 0 means the pattern was not found + if(NOT res EQUAL 0) + set(ADD_HOOK TRUE) + endif() + endif() + if(ADD_HOOK) + configure_file(dummy_hook "${HOOK_LOCATION}" COPYONLY) + endif() +endif() diff --git a/third_party/dummy-hook/dummy_hook b/third_party/dummy-hook/dummy_hook new file mode 100755 index 
00000000000..4274cb3e418 --- /dev/null +++ b/third_party/dummy-hook/dummy_hook @@ -0,0 +1,5 @@ +#!/bin/bash +echo "Please only commit to Ginkgo when GINKGO_DEVEL_TOOLS is enabled in CMake." +echo "This can be set in your initial invocation of CMake by using" +echo " -DGINKGO_DEVEL_TOOLS=ON or by editing the CMakeCache.txt file." +exit 1 diff --git a/third_party/gflags/CMakeLists.txt b/third_party/gflags/CMakeLists.txt index 5581483cf0d..048e3343004 100644 --- a/third_party/gflags/CMakeLists.txt +++ b/third_party/gflags/CMakeLists.txt @@ -3,23 +3,33 @@ if(MSVC) # use the ginkgo's flags to use the same runtime libraries as ginkgo ginkgo_load_git_package(gflags_external "https://github.com/gflags/gflags.git" - "0b7f8db2c6b1b0b2451da0923a9ab09cc610e8d1" + "f7388c6655e699f777a5a74a3c9880b9cfaabe59" "-DGFLAGS_BUILD_TESTING=OFF" "-DGFLAGS_BUILD_gflags_LIB=OFF" "-DGFLAGS_BUILD_gflags_nothreads_LIB=ON" "-DGFLAGS_BUILD_STATIC_LIBS=ON" "-DGFLAGS_BUILD_PACKAGING=OFF" "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}" "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}" "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}") else() + # There is a weird issue with Intel 19 and c++17 causing a linking error. + # Use c++11 instead. The regex is anchored so that only major version 19 matches. + set(INTEL19_STD_FIX "") + if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel" AND CMAKE_CXX_COMPILER_VERSION MATCHES "^19\\.") + set(INTEL19_STD_FIX "-DCMAKE_CXX_FLAGS=-std=c++11") + endif() ginkgo_load_git_package(gflags_external "https://github.com/gflags/gflags.git" - "0b7f8db2c6b1b0b2451da0923a9ab09cc610e8d1" + "f7388c6655e699f777a5a74a3c9880b9cfaabe59" "-DGFLAGS_BUILD_TESTING=OFF" "-DGFLAGS_BUILD_gflags_LIB=OFF" "-DGFLAGS_BUILD_gflags_nothreads_LIB=ON" "-DGFLAGS_BUILD_STATIC_LIBS=ON" - "-DGFLAGS_BUILD_PACKAGING=OFF") + "-DGFLAGS_BUILD_PACKAGING=OFF" + "${INTEL19_STD_FIX}" + ) endif() if(WIN32) # gflags uses gflags_nothreads_static not gflags_nothreads_static in Windows. 
- ginkgo_add_external_target(gflags gflags_nothreads_static build/include build/lib STATIC "_debug" gflags_external FALSE) + ginkgo_add_external_target(gflags gflags_nothreads_static build/include build/lib + STATIC "_debug" gflags_external FALSE) else() - ginkgo_add_external_target(gflags gflags_nothreads build/include build/lib STATIC "_debug" gflags_external FALSE) + ginkgo_add_external_target(gflags gflags_nothreads build/include build/lib + STATIC "_debug" gflags_external FALSE) endif() diff --git a/third_party/git-cmake-format/CMakeLists.txt b/third_party/git-cmake-format/CMakeLists.txt index c05253a738c..b8e3d623050 100644 --- a/third_party/git-cmake-format/CMakeLists.txt +++ b/third_party/git-cmake-format/CMakeLists.txt @@ -1,5 +1,6 @@ ginkgo_load_git_package(git-cmake-format "https://github.com/ginkgo-project/git-cmake-format.git" - "e19ab13e640d58abd3bfdbff5f77b499b2ec4169") + "29c23665d624e1cae1308bec651706fdaa8fe38b" + "-DGCF_CLANGFORMAT_MINIMAL_VERSION=5.0.0") add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/build EXCLUDE_FROM_ALL) diff --git a/third_party/gtest/CMakeLists.txt b/third_party/gtest/CMakeLists.txt index ac9da49d307..48976e93f69 100644 --- a/third_party/gtest/CMakeLists.txt +++ b/third_party/gtest/CMakeLists.txt @@ -10,7 +10,9 @@ else() "https://github.com/google/googletest.git" "df428ec11891f12c81e2872c0432e342b5403a34" # Work around the linking errors when compiling gtest with CUDA - "-Dgtest_disable_pthreads=ON") + "-Dgtest_disable_pthreads=ON" + # Ask CMake for position-independent code instead of hard-coding a compiler-specific flag + "-DCMAKE_POSITION_INDEPENDENT_CODE=ON") endif() ginkgo_add_external_target(GTest::GTest gtest src/googletest/include build/googlemock/gtest diff --git a/third_party/rapidjson/CMakeLists.txt b/third_party/rapidjson/CMakeLists.txt index a3d8a20af36..bd2ae7899b1 100644 --- a/third_party/rapidjson/CMakeLists.txt +++ b/third_party/rapidjson/CMakeLists.txt @@ -1,6 +1,6 @@ ginkgo_load_git_package(rapidjson_external "https://github.com/Tencent/rapidjson.git" - 
"6a6bed2759d42891f9e29a37b21315d3192890ed" + "88bd956d66d348f478bceebfdadb8e26c6844695" "-DRAPIDJSON_BUILD_DOC=OFF" "-DRAPIDJSON_BUILD_EXAMPLES=OFF" "-DRAPIDJSON_BUILD_TESTS=OFF" "-DRAPIDJSON_BUILD_THIRDPARTY_GTEST=OFF" "-DRAPIDJSON_BUILD_CXX11=ON")